{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 初始化运行头文件、函数和参数等"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['/Users/wangnaixuan/Documents/Gitee/PS_baseline/notebooks', '/usr/local/anaconda3/envs/nash/lib/python37.zip', '/usr/local/anaconda3/envs/nash/lib/python3.7', '/usr/local/anaconda3/envs/nash/lib/python3.7/lib-dynload', '', '/usr/local/anaconda3/envs/nash/lib/python3.7/site-packages', '/usr/local/anaconda3/envs/nash/lib/python3.7/site-packages/IPython/extensions', '/Users/wangnaixuan/.ipython', '/opt/gurobi201/linux32/lib/python2.5', '/Users/wangnaixuan/Documents/Gitee/PS_baseline']\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "import pm4py\n",
    "import pandas as pd\n",
    "import re\n",
    "import random\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import datetime\n",
    "from keras.preprocessing import sequence\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\")))\n",
    "print(sys.path)\n",
    "\n",
    "def display_dataset(data_frame):\n",
    "    \"\"\"Print summary statistics and plot histograms for an event log.\n",
    "\n",
    "    For every case in `data_frame` the number of events (trace length) and\n",
    "    the `total_time_true` value of the case's first row are collected.\n",
    "    The number of cases and the min/max of both quantities are printed,\n",
    "    then two histograms (trace lengths, true values) are drawn side by side.\n",
    "\n",
    "    Args:\n",
    "        data_frame: DataFrame with 'case:concept:name' and\n",
    "            'total_time_true' columns.\n",
    "    \"\"\"\n",
    "    trace_lengths = []\n",
    "    true_values = []\n",
    "    # Walk each case once via groupby, reading from the `data_frame` argument\n",
    "    # (not a notebook-level global) so the function works for any frame.\n",
    "    for _, case_df in data_frame.groupby(\"case:concept:name\", sort=False):\n",
    "        trace_lengths.append(case_df.shape[0])\n",
    "        true_values.append(case_df.iloc[0]['total_time_true'])\n",
    "    print(len(trace_lengths), len(true_values))\n",
    "    print(min(trace_lengths), max(trace_lengths), min(true_values), max(true_values))\n",
    "\n",
    "    df_lens = pd.DataFrame({\"trace length\": trace_lengths})\n",
    "    df_truevs = pd.DataFrame({\"true values\": true_values})\n",
    "    fig = plt.figure(figsize=(12, 6))\n",
    "    ax1 = fig.add_subplot(121)\n",
    "    # One bin per distinct value.\n",
    "    df_lens.hist(bins=len(set(trace_lengths)), ax=ax1)\n",
    "    # NOTE(review): a bare grid() call toggles grid visibility; pandas' hist\n",
    "    # appears to enable the grid by default, so this may hide it - confirm intent.\n",
    "    plt.grid()\n",
    "    ax2 = fig.add_subplot(122)\n",
    "    df_truevs.hist(bins=len(set(true_values)), ax=ax2)\n",
    "    plt.grid()\n",
    "    \n",
    "def sample_data(data_frame, all_cids, num_train, num_test):\n",
    "    sample_cids = random.sample(all_cids, num_train+num_test)\n",
    "    train_cids = random.sample(sample_cids, num_train)\n",
    "    test_cids = [x for x in sample_cids if x not in train_cids]\n",
    "    print(len(train_cids), len(test_cids))\n",
    "    \n",
    "    df_samples = pd.DataFrame()\n",
    "    for cid in sample_cids:\n",
    "        this_df = data_frame[data_frame[\"case:concept:name\"]==cid]\n",
    "        df_samples = df_samples.append(this_df)\n",
    "    print(df_samples.shape)\n",
    "    return df_samples, train_cids, test_cids"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BPIC 2013数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b736353c9e2545939cc86d7c1d48316f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=7554.0), HTML(val…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0\n",
      "10000\n",
      "20000\n",
      "30000\n",
      "40000\n",
      "50000\n",
      "(56733, 8)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import BPIC2013Preprocess\n",
    "\n",
    "# Run the project's BPIC 2013 preprocessing and keep its result in `df`.\n",
    "# NOTE: `df` is rebound by later cells (the CSV load and the\n",
    "# EnvironmentalPermit preprocessing), so cell order matters.\n",
    "df = BPIC2013Preprocess().preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  3435 , test_data_size:  1473\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed, max_len=20):\n",
    "    \"\"\"Build padded prefix sequences (train/test) from the BPIC 2013 event log.\n",
    "\n",
    "    Case ids are split 70/30 into train and test with a fixed random seed.\n",
    "    For every case, each prefix of its events except the complete trace\n",
    "    becomes one sample. A sample row is laid out as::\n",
    "\n",
    "        [case id,\n",
    "         one-hot(resource country), one-hot(organization country),\n",
    "         one-hot(organization involved), one-hot(concept:name),\n",
    "         one-hot(impact), one-hot(lifecycle:transition),\n",
    "         days since case start, days since previous event,\n",
    "         days since midnight, days until the last event of the case]\n",
    "\n",
    "    Every column except the first (case id) and the last (remaining time)\n",
    "    is min-max scaled with statistics of the training samples only;\n",
    "    constant columns are left untouched.\n",
    "\n",
    "    Args:\n",
    "        data_processed: event DataFrame holding the categorical columns\n",
    "            above plus 'case:concept:name' and 'time:timestamp_short'\n",
    "            (strings formatted as '%Y-%m-%d %H:%M:%S').\n",
    "        max_len: value passed to keras `pad_sequences` as `maxlen`.\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(train_data_set, test_data_set)` of float64 arrays returned\n",
    "        by `pad_sequences`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the training split yields no prefix sample.\n",
    "    \"\"\"\n",
    "    case_col = 'case:concept:name'\n",
    "    time_col = 'time:timestamp_short'\n",
    "    time_format = '%Y-%m-%d %H:%M:%S'\n",
    "    # The order of this list fixes the layout of the one-hot feature blocks.\n",
    "    categorical_cols = [\n",
    "        'resource country',\n",
    "        'organization country',\n",
    "        'organization involved',\n",
    "        'concept:name',  # activity name\n",
    "        'impact',\n",
    "        'lifecycle:transition',\n",
    "    ]\n",
    "\n",
    "    # Sample from a sorted list: random.sample() no longer accepts sets\n",
    "    # (Python >= 3.11) and set iteration order is not stable for string ids,\n",
    "    # which would defeat the fixed seed.\n",
    "    all_case_ids = sorted(set(data_processed[case_col].values))\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    train_cids_set = set(random.sample(all_case_ids, num_trainset))\n",
    "    test_cids_set = set(all_case_ids) - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    # One encoder per categorical column, fitted on the whole log so that\n",
    "    # train and test share the same category layout.\n",
    "    encoders = {}\n",
    "    for col in categorical_cols:\n",
    "        encoder = OneHotEncoder()\n",
    "        encoder.fit(np.reshape(np.array(data_processed[col].values), (-1, 1)))\n",
    "        encoders[col] = encoder\n",
    "\n",
    "    def one_hot(col, values):\n",
    "        \"\"\"Return the dense one-hot matrix of `values` for column `col`.\"\"\"\n",
    "        encoded = encoders[col].transform(np.reshape(np.array(values), (-1, 1)))\n",
    "        # Densify explicitly instead of relying on the `sparse=False`\n",
    "        # constructor flag, which newer scikit-learn releases renamed.\n",
    "        return encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)\n",
    "\n",
    "    def days(delta):\n",
    "        \"\"\"Convert a timedelta to fractional days.\"\"\"\n",
    "        return delta.total_seconds()/3600/24\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        \"\"\"Return every proper prefix of every case whose id is in `cid_set`.\"\"\"\n",
    "        data_set = []\n",
    "        selected = data_processed[data_processed[case_col].isin(list(cid_set))]\n",
    "        # groupby visits each case once and keeps its rows in log order.\n",
    "        for cid, thisdf in selected.groupby(case_col):\n",
    "            n_events = thisdf.shape[0]\n",
    "            event_times = [datetime.datetime.strptime(ts, time_format)\n",
    "                           for ts in thisdf[time_col].values]\n",
    "            start_time = event_times[0]\n",
    "            end_time = event_times[-1]\n",
    "            last_time = start_time\n",
    "            # Encode all events of the case at once, one block per column.\n",
    "            encoded_rows = np.hstack([one_hot(col, thisdf[col].values)\n",
    "                                      for col in categorical_cols]).tolist()\n",
    "            case_id = int(cid)\n",
    "\n",
    "            tmpdata = []\n",
    "            for i, event_dt in enumerate(event_times):\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "                row = [case_id]\n",
    "                row.extend(encoded_rows[i])\n",
    "                row.append(days(event_dt - start_time))  # time since case start\n",
    "                row.append(days(event_dt - last_time))  # time since previous event\n",
    "                row.append(days(event_dt - midnight_time))  # time since midnight\n",
    "                row.append(days(end_time - event_dt))  # label: time until the case's last event\n",
    "                last_time = event_dt\n",
    "                tmpdata.append(row)\n",
    "                # Every prefix except the complete trace becomes a sample.\n",
    "                if i != n_events - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set\n",
    "\n",
    "    train_arrays = [np.asarray(seq, dtype='float64') for seq in generate_data(train_cids_set)]\n",
    "    test_arrays = [np.asarray(seq, dtype='float64') for seq in generate_data(test_cids_set)]\n",
    "    if not train_arrays:\n",
    "        raise ValueError('the training split produced no prefix samples')\n",
    "\n",
    "    # Column-wise min/max over the training samples only.\n",
    "    n_cols = train_arrays[0].shape[1]\n",
    "    min_value = np.full(n_cols, np.inf)\n",
    "    max_value = np.full(n_cols, -np.inf)\n",
    "    for arr in train_arrays:\n",
    "        min_value = np.minimum(min_value, arr.min(axis=0))\n",
    "        max_value = np.maximum(max_value, arr.max(axis=0))\n",
    "\n",
    "    # Scale every non-constant column except the case id (first) and the\n",
    "    # remaining-time label (last).\n",
    "    value_range = max_value - min_value\n",
    "    scaled_cols = value_range != 0\n",
    "    scaled_cols[0] = False\n",
    "    scaled_cols[-1] = False\n",
    "\n",
    "    def normalise(arrays):\n",
    "        \"\"\"Min-max scale the selected columns of each sample in place.\"\"\"\n",
    "        for arr in arrays:\n",
    "            arr[:, scaled_cols] = (arr[:, scaled_cols] - min_value[scaled_cols]) / value_range[scaled_cols]\n",
    "        return arrays\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(normalise(train_arrays), maxlen=max_len, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(normalise(test_arrays), maxlen=max_len, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the padded train/test arrays produced by prepare_train_data.\n",
    "# NOTE(review): the target directory is assumed to exist already - confirm.\n",
    "np.save(\"../data/filtered_data/BPIC 2013/train_data.npy\", train_data_set)\n",
    "np.save(\"../data/filtered_data/BPIC 2013/test_data.npy\", test_data_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### 数据过滤\n",
    "- 长度大于等于15小于等于50\n",
    "- 流程时间大于等于1小于等于30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  resource country organization country organization involved concept:name  \\\n",
      "0           Sweden                   se            Org line C     Accepted   \n",
      "1           Sweden                   se            Org line C     Accepted   \n",
      "2           Sweden                   se            Org line C     Accepted   \n",
      "3           Sweden                   se            Org line C     Accepted   \n",
      "4           Sweden                   se            Org line C    Completed   \n",
      "\n",
      "   impact lifecycle:transition time:timestamp_short  case:concept:name  \\\n",
      "0  Medium          In Progress  2012-05-02 15:49:34          738576399   \n",
      "1  Medium          In Progress  2012-05-02 15:50:49          738576399   \n",
      "2  Medium                 Wait  2012-05-02 15:52:51          738576399   \n",
      "3  Medium          In Progress  2012-05-02 16:33:55          738576399   \n",
      "4  Medium             Resolved  2012-05-02 16:34:08          738576399   \n",
      "\n",
      "   time_spent  total_time_pred  total_time_true  \n",
      "0    0.000000        15.905615         7.391076  \n",
      "1    0.000868        15.905451         7.391076  \n",
      "2    0.002280        28.432586         7.391076  \n",
      "3    0.030799        15.899985         7.391076  \n",
      "4    0.030949         6.001347         7.391076  \n"
     ]
    }
   ],
   "source": [
    "data_path = \"../data/drl_data/bpic2013/bpic2013_wp_nn.csv\"\n",
    "\n",
    "# Load the CSV at `data_path`. NOTE: this rebinds `df`, replacing the frame\n",
    "# returned by the preprocessing cell earlier in the notebook.\n",
    "df = pd.read_csv(data_path)\n",
    "print(df.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 1: 抽样case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4908 4908\n",
      "5 123 0.003043981481481481 771.3517708333335\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtIAAAF1CAYAAADFmw1hAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAjhUlEQVR4nO3dfbRddX3n8ffHBAFFKwwXGhI01KaM4KpgsyiWLsuICoo1dK2hK7badElLx8H6MHY00a5iZzUOVmsf1hQ7qVJji9DUhyEVn5DqsrYqBgQFIiWWFCKRxGdqO4yE7/xxdvRwuTe593fvebg379dad529f3vvs7/n3Hv2/tx9fnvvVBWSJEmSZudRoy5AkiRJWogM0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QWvSQ7kzx7BOtdmaSSLB32uiVJU0vyriS/N+o6tDgYpDUQowqvo3QovmZJArd/OnQZpDUSHqWVpEOH23wtVgZpzbskfwk8EfjbJP+a5LV93RwuSnI38HfdvH+T5GtJvpPkU0lO7XueI5P8QZJ/6aZ/OsmR3bQzk/xjkm8nuSXJ2TOs7VFJ1if5SpJvJNmS5Jhu2v4a1yW5O8nXk7xhUj2bk3wryfbude2a7jX3rfaXp3o+SVoMZrrNT3L2/m1m37I/OJJ9oO3zFOvcnuQFfeNLu23s07vxafctk57nV5N8elJbJfnxbvjwJG/ttuH3Jfmzvv3QsUk+2O2Hvpnk75OYqw4x/sI176rqJcDdwM9X1VFV9ft9k38OeApwbjf+YWAVcBxwE3Bl37xvBX4K+BngGOC1wENJlgPXAr/Xtf8W8L4kEzMo7xXABV0dJwDfAv500jw/C5wMnAP8TpKndO2XAiuBHwOeA7x4hq95uueTpAVvltv8A5nJ9nm/q4AX9Y2fC3y9qm7qxg+0b5mNNwM/AZwG/DiwHPidbtprgF3ABHA88HqgGtejBcogrWF7Y1V9r6r+HaCqrqiq+6vqAeCNwNOS/Ej3X/1LgVdW1Veral9V/WM334uBD1XVh6rqoaq6DtgGPH8G6/8N4A1Vtatvnf950teOv1tV/15VtwC3AE/r2n8ReFNVfauqdgF/MsPXPN3zSdJi97Bt/kHMZPu833uAFyZ5TDf+S10bMP2+ZTaFJwnw68Crq+qbVXU/8CZgbTfL94FlwJOq6vtV9fdVZZA+xNhnScN2z/6BJEuAjcCF9P6jf6ibdCxwOHAE8JUpnuNJwIVJfr6v7TDgEzNY/5OADyR5qK9tH72jCft9rW/434CjuuET+uufNHwg0z2fJC12M91OwoG3z1/tn7GqdiTZDvx8kr8FXgicDgfdt3xnFvVMAI8BbuxlagACLOmG30IvpH+sm76pqi6bxfNrETBIa1Cm+6+8v/2XgDXAs4GdwI/Q+yovwNeB/ws8md5R3H73AH9ZVb/eUNc9wEur6h8mT0iy8iDL7gZWALd34ydOmu6RCEmHqpls879HL5gCPwi8/V3ypt0+T2N/945HAbdX1Y6u/UD7lskm1/SjfdO+Dvw7cGpVfXXygt0R6tcAr+n6YH8iyeer6voZ1q9FwK4dGpT76PUlPpDHAQ8A36C3IXvT/glV9RBwBfC2JCckWZLkGUkOB/6K3lGIc7v2I7qTWFbMoK4/AzYmeRJAkokka2b4mrYAG5Ic3fXTfvmk6TN5zZK0GM1k+/dPwBFJzk9yGPDb9L593G+22+ergecCL6OvWwcH2LdM4Rbg1CSnJTmC3hFm4Af7oT8H/jDJcV1Ny5Oc2w2/IMmPd11Avkvv6Pm+g7wHWmQM0hqU/wn8dnc2829NM8+7gX+h95Xd7cBnJ03/LeBLwOeBb9I76eNRVXUPvaMNrwf20juK8d+Z2d/zHwNb6X0Vd3+3zp+e4Wv6H/ROLLkL+DjwXnob6/1m8polaTE66Pavqr4D/FfgHfS2+9+jt03db1bb56raDXyG
3gnpf9036WD7lv7n+Cd62/aPA3cCn540y+uAHcBnk3y3m+/kbtqqbvxfuzour6pPTrcuLU6xX7zUJsnLgLVV9XOjrkWSJA2fR6SlGUqyLMlZ3bVOT6bXN+4Do65LkiSNhicbSjP3aOB/AycB36bXP+/yURYkSZJGx64dkiRJUgO7dkiSJEkNDNKSJElSg7HvI33sscfWypUrR12GpEPcjTfe+PWqmjj4nJott/OSxkHLdn7sg/TKlSvZtm3bqMuQdIhL8i+jrmGxcjsvaRy0bOcP2rUjyRVJ9iS5ta/tLUm+nOSLST6Q5Al90zYk2ZHkjv13/+nafyrJl7ppf5K+G9dLkiRJC81M+ki/CzhvUtt1wFOr6ifp3fJzA0CSU4C1wKndMpcnWdIt83bgYnp3Alo1xXNKkiRJC8ZBg3RVfYre7Zn72z5WVQ92o58FVnTDa4Crq+qBqrqL3m01z0iyDHh8VX2metfbezdwwTy9BkmSJGno5uOqHS8FPtwNLwfu6Zu2q2tb3g1Pbp9SkouTbEuybe/evfNQoiRJkjS/5hSkk7wBeBC4cn/TFLPVAdqnVFWbqmp1Va2emPAkeUmSJI2f5qt2JFkHvAA4p354e8RdwIl9s60A7u3aV0zRLkmSJC1ITUekk5wHvA54YVX9W9+krcDaJIcnOYneSYU3VNVu4P4kZ3ZX6/gV4Jo51i5JkiSNzEGPSCe5CjgbODbJLuBSelfpOBy4rruK3Wer6r9U1W1JtgC30+vycUlV7eue6mX0rgByJL0+1R9GkiRJWqAOGqSr6kVTNL/zAPNvBDZO0b4NeOqsqpMkSZLG1HxctUOSJEk65BikJUmSpAYGaUmSJKmBQVqSJElq0Hwd6XG2cv21Dxvfedn5I6pEkha2JEcAn6J3paalwHur6tIkxwB/DawEdgK/WFXf6pbZAFwE7ANeUVUfHUHpkjRwHpGWJB3IA8CzquppwGnAeUnOBNYD11fVKuD6bpwkpwBrgVOB84DLkywZReGSNGgGaUnStKrnX7vRw7qfAtYAm7v2zcAF3fAa4OqqeqCq7gJ2AGcMr2JJGh6DtCTpgJIsSXIzsAe4rqo+Bxzf3bWW7vG4bvblwD19i+/q2iY/58VJtiXZtnfv3oHWL0mDYpCWJB1QVe2rqtOAFcAZSQ50c61M9RRTPOemqlpdVasnJibmqVJJGi6DtCRpRqrq28An6fV9vi/JMoDucU832y7gxL7FVgD3Dq9KSRqeRXnVjskmX8UDvJKHJM1Ekgng+1X17SRHAs8G3gxsBdYBl3WP13SLbAXek+RtwAnAKuCGoRcuSUNwSARpSVKzZcDm7sobjwK2VNUHk3wG2JLkIuBu4EKAqrotyRbgduBB4JKq2jei2iVpoAzSkqRpVdUXgdOnaP8GcM40y2wENg64NEkaOftIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLU4KBBOskVSfYkubWv7Zgk1yW5s3s8um/ahiQ7ktyR5Ny+9p9K8qVu2p8kyfy/HEmSJGk4ZnJE+l3AeZPa1gPXV9Uq4PpunCSnAGuBU7tlLk+ypFvm7cDFwKruZ/JzSpIkSQvGQYN0VX0K+Oak5jXA5m54M3BBX/vVVfVAVd0F7ADOSLIMeHxVfaaqCnh33zKSJEnSgtPaR/r4qtoN0D0e17UvB+7pm29X17a8G57cLkmSJC1I832y4VT9nusA7VM/SXJxkm1Jtu3du3feipMkSZLmS2uQvq/rrkH3uKdr
3wWc2DffCuDern3FFO1TqqpNVbW6qlZPTEw0lihJkiQNTmuQ3gqs64bXAdf0ta9NcniSk+idVHhD1/3j/iRndlfr+JW+ZSRJkqQFZ+nBZkhyFXA2cGySXcClwGXAliQXAXcDFwJU1W1JtgC3Aw8Cl1TVvu6pXkbvCiBHAh/ufiRJkqQF6aBBuqpeNM2kc6aZfyOwcYr2bcBTZ1WdJEmSNKa8s6EkSZLUwCAtSZIkNTBIS5KmleTEJJ9Isj3JbUle2bW/MclXk9zc/Ty/b5kNSXYkuSPJuaOrXpIG66B9pCVJh7QHgddU1U1JHgfcmOS6btofVtVb+2dOcgqwFjgVOAH4eJKf6DvxXJIWDY9IS5KmVVW7q+qmbvh+YDsHvjPtGuDqqnqgqu4CdgBnDL5SSRo+g7QkaUaSrAROBz7XNb08yReTXJHk6K5tOXBP32K7OHDwlqQFyyAtSTqoJEcB7wNeVVXfBd4OPBk4DdgN/MH+WadYvKZ4vouTbEuybe/evYMpWpIGzCAtSTqgJIfRC9FXVtX7AarqvqraV1UPAX/OD7tv7AJO7Ft8BXDv5Oesqk1VtbqqVk9MTAz2BUjSgBikJUnTShLgncD2qnpbX/uyvtl+Abi1G94KrE1yeJKTgFXADcOqV5KGyat2SJIO5CzgJcCXktzctb0eeFGS0+h129gJ/AZAVd2WZAtwO70rflziFTskLVYGaUnStKrq00zd7/lDB1hmI7BxYEVJ0piwa4ckSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNZhTkE7y6iS3Jbk1yVVJjkhyTJLrktzZPR7dN/+GJDuS3JHk3LmXL0mSJI1Gc5BOshx4BbC6qp4KLAHWAuuB66tqFXB9N06SU7rppwLnAZcnWTK38iVJkqTRmGvXjqXAkUmWAo8B7gXWAJu76ZuBC7rhNcDVVfVAVd0F7ADOmOP6JUmSpJFoDtJV9VXgrcDdwG7gO1X1MeD4qtrdzbMbOK5bZDlwT99T7OraHiHJxUm2Jdm2d+/e1hIlSZKkgZlL146j6R1lPgk4AXhskhcfaJEp2mqqGatqU1WtrqrVExMTrSVKkiRJAzOXrh3PBu6qqr1V9X3g/cDPAPclWQbQPe7p5t8FnNi3/Ap6XUEkSZKkBWcuQfpu4Mwkj0kS4BxgO7AVWNfNsw64phveCqxNcniSk4BVwA1zWL8kSZI0MktbF6yqzyV5L3AT8CDwBWATcBSwJclF9ML2hd38tyXZAtzezX9JVe2bY/2SJEnSSDQHaYCquhS4dFLzA/SOTk81/0Zg41zWKUkaniQnAu8GfhR4CNhUVX+c5Bjgr4GVwE7gF6vqW90yG4CLgH3AK6rqoyMoXZIGzjsbSpIO5EHgNVX1FOBM4JLuvgDeM0DSIc8gLUmaVlXtrqqbuuH76Z0LsxzvGSBJBmlJ0swkWQmcDnyOebhngCQtdAZpSdJBJTkKeB/wqqr67oFmnaLtEfcM8MZbkhYDg7Qk6YCSHEYvRF9ZVe/vmud0zwBvvCVpMTBIS5Km1d0n4J3A9qp6W98k7xkg6ZA3p8vfSZIWvbOAlwBfSnJz1/Z64DK8Z4CkQ5xBWpI0rar6NFP3ewbvGSDpEGfXDkmSJKmBQVqSJElqYJCWJEmSGhikJUmSpAYGaUmSJKmBQVqSJElqYJCWJEmSGhikJUmSpAYGaUmSJKmBQVqSJElqYJCWJEmSGiwddQGjsnL9tQ8b
33nZ+SOqRJIkSQuRR6QlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIazClIJ3lCkvcm+XKS7UmekeSYJNclubN7PLpv/g1JdiS5I8m5cy9fkiRJGo25HpH+Y+AjVfUfgacB24H1wPVVtQq4vhsnySnAWuBU4Dzg8iRL5rh+SZIkaSSag3SSxwPPBN4JUFX/r6q+DawBNnezbQYu6IbXAFdX1QNVdRewAzijdf2SJEnSKM3liPSPAXuBv0jyhSTvSPJY4Piq2g3QPR7Xzb8cuKdv+V1dmyRJkrTgzCVILwWeDry9qk4HvkfXjWMamaKtppwxuTjJtiTb9u7dO4cSJUmSpMGYS5DeBeyqqs914++lF6zvS7IMoHvc0zf/iX3LrwDuneqJq2pTVa2uqtUTExNzKFGSJEkajOYgXVVfA+5JcnLXdA5wO7AVWNe1rQOu6Ya3AmuTHJ7kJGAVcEPr+iVJkqRRmutVO34TuDLJF4HTgDcBlwHPSXIn8JxunKq6DdhCL2x/BLikqvbNcf2SpAFKckWSPUlu7Wt7Y5KvJrm5+3l+3zQvcyrpkLF0LgtX1c3A6ikmnTPN/BuBjXNZpyRpqN4F/C/g3ZPa/7Cq3trfMOkypycAH0/yEx40kbRYeWdDSdK0qupTwDdnOLuXOZV0SDFIS5JavDzJF7uuH/vvYDvjy5x6dSZJi4FBWpI0W28Hnkzv3JjdwB907TO+zKlXZ5K0GBikJUmzUlX3VdW+qnoI+HN+2H1jxpc5laTFwCAtSZqV/fcK6PwCsP+KHl7mVNIhZU5X7ZAkLW5JrgLOBo5Nsgu4FDg7yWn0um3sBH4Depc5TbL/MqcP4mVOJS1yBmlJ0rSq6kVTNL/zAPN7mVNJhwy7dkiSJEkNDNKSJElSA4O0JEmS1MAgLUmSJDUwSEuSJEkNDNKSJElSA4O0JEmS1MAgLUmSJDUwSEuSJEkNDNKSJElSA4O0JEmS1MAgLUmSJDUwSEuSJEkNDNKSJElSA4O0JEmS1MAgLUmSJDUwSEuSJEkNlo66gHGxcv21Dxvfedn5I6pEkiRJC4FHpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhoYpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhoYpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhoYpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhoYpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhrMOUgnWZLkC0k+2I0fk+S6JHd2j0f3zbshyY4kdyQ5d67rliRJkkZlPo5IvxLY3je+Hri+qlYB13fjJDkFWAucCpwHXJ5kyTysX5IkSRq6OQXpJCuA84F39DWvATZ3w5uBC/rar66qB6rqLmAHcMZc1i9JkiSNylyPSP8R8Frgob6246tqN0D3eFzXvhy4p2++XV3bIyS5OMm2JNv27t07xxIlSZKk+dccpJO8ANhTVTfOdJEp2mqqGatqU1WtrqrVExMTrSVKkuYoyRVJ9iS5ta/Nc2EkibkdkT4LeGGSncDVwLOS/BVwX5JlAN3jnm7+XcCJfcuvAO6dw/olSYP3LnrntfTzXBhJYg5Buqo2VNWKqlpJb8P5d1X1YmArsK6bbR1wTTe8FVib5PAkJwGrgBuaK5ckDVxVfQr45qRmz4WRJGDpAJ7zMmBLkouAu4ELAarqtiRbgNuBB4FLqmrfANYvSRqsh50Lk6T/XJjP9s13wHNhgIsBnvjEJw6wVEkanHkJ0lX1SeCT3fA3gHOmmW8jsHE+1ilJGjuzOhcG2ASwevXqKeeRpHHnnQ0lSbPl
uTCShEFakjR7ngsjSQymj7QkaZFIchVwNnBskl3ApXgujCQBBmlJ0gFU1YummeS5MJIOeXbtkCRJkhoYpCVJkqQGBmlJkiSpgX2kp7Fy/bWPaNt52fkjqESSJEnjyCPSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1WDrqAhaSleuvfdj4zsvOH1ElkiRJGjWPSEuSJEkNDNKSJElSA4O0JEmS1MAgLUmSJDVoDtJJTkzyiSTbk9yW5JVd+zFJrktyZ/d4dN8yG5LsSHJHknPn4wVIkiRJozCXI9IPAq+pqqcAZwKXJDkFWA9cX1WrgOu7cbppa4FTgfOAy5MsmUvxkiRJ0qg0B+mq2l1VN3XD9wPbgeXAGmBzN9tm4IJueA1wdVU9UFV3ATuAM1rXL0karSQ7k3wpyc1JtnVt034rKUmLzbz0kU6yEjgd+BxwfFXthl7YBo7rZlsO3NO32K6ubarnuzjJtiTb9u7dOx8lSpIG4z9V1WlVtbobn/JbSUlajOZ8Q5YkRwHvA15VVd9NMu2sU7TVVDNW1SZgE8Dq1aunnGcceIMWSXqENcDZ3fBm4JPA60ZVjCQN0pyOSCc5jF6IvrKq3t8135dkWTd9GbCna98FnNi3+Arg3rmsX5I0UgV8LMmNSS7u2qb7VlKSFp25XLUjwDuB7VX1tr5JW4F13fA64Jq+9rVJDk9yErAKuKF1/ZKkkTurqp4OPI/eCefPnOmCduGTtBjM5Yj0WcBLgGd1J5rcnOT5wGXAc5LcCTynG6eqbgO2ALcDHwEuqap9c6pekjQyVXVv97gH+AC9E8in+1Zy8rKbqmp1Va2emJgYVsmSNK/mctWOT1dVquonuxNNTquqD1XVN6rqnKpa1T1+s2+ZjVX15Ko6uao+PD8vQZI0bEkem+Rx+4eB5wK3Mv23kvNq8jkqkjQKcz7ZUJJ0SDoe+EB3gvlS4D1V9ZEknwe2JLkIuBu4cIQ1StJAGaQlSbNWVf8MPG2K9m8A5wy/Ikkavnm5jrQkSZJ0qDFIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVIDg7QkSZLUwCAtSZIkNTBIS5IkSQ0M0pIkSVKDpaMuYDFZuf7aR7TtvOz8EVQiSZKkQfOItCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAIC1JkiQ1MEhLkiRJDQzSkiRJUgODtCRJktTAOxsO2eS7H3rnQ0mSpIXJI9KSJElSA4O0JEmS1MCuHQM2uSvHTKbb3UOSJGn8eURakiRJamCQliRJkhoYpCVJC9rButBJ0qAYpCVJkqQGBmlJkiSpgUFakiRJamCQliRJkhoYpCVJkqQGBmlJkiSpgUFakiRJauAtwheAmVwj1duKS5IkDZdBegx5cwFJmpn928uV669l52Xn/+BRkobBrh2SpEXHAxKShsEgLUlakKYLy/3tK9dfa6iWNDAGaUnSojA5MBugJQ2afaQXick7DPsISpIkDZZB+hAx1ZEZw7YkSVK7oXftSHJekjuS7EiyftjrlyQNltt5SYeKoR6RTrIE+FPgOcAu4PNJtlbV7cOsQ1PzetWS5mpct/NeFk/SIAy7a8cZwI6q+meAJFcDawCD9DybSShuORHnYMtMtaOaj2XcAUoLhtt5SYeMYQfp5cA9feO7gJ8ecg0aoEGdJT+TYD2q8G3olx5mbLfz0x2Vnm67tf8GL/uHp1umf9qBjnzP91Hxgz3ffKxv
utc/3/zGQAtVqmp4K0suBM6tql/rxl8CnFFVvzlpvouBi7vRk4E7gGOBrw+t2Pmx0GpeaPWCNQ+LNcOTqmpiHp9vUZrjdn42xulvclxqsY5HGpdarOORxqWW/jpmvZ0f9hHpXcCJfeMrgHsnz1RVm4BN/W1JtlXV6sGWN78WWs0LrV6w5mGxZs1C83Z+Nsbp9zsutVjHI41LLdbxSONSy1zrGPZVOz4PrEpyUpJHA2uBrUOuQZI0OG7nJR0yhnpEuqoeTPJy4KPAEuCKqrptmDVIkgbH7bykQ8nQb8hSVR8CPtSwaPNXgCO00GpeaPWCNQ+LNWvG5rCdn41x+v2OSy3W8UjjUot1PNK41DKnOoZ6sqEkSZK0WAz9zoaSJEnSYjD2QXoh3Go2yYlJPpFke5Lbkryyaz8myXVJ7uwejx51rf2SLEnyhSQf7MbHul6AJE9I8t4kX+7e72eMc91JXt39Tdya5KokR4xjvUmuSLInya19bdPWmWRD95m8I8m5Y1LvW7q/iy8m+UCSJ4xLvZpfw9wvjMtno2U/M8BajkhyQ5Jbulp+d1S1dM89433ZAN+TnUm+lOTmJNtGWMes9pEDrOPk7r3Y//PdJK8aUS2vziz2w7Ouo6rG9ofeiSpfAX4MeDRwC3DKqOuaos5lwNO74ccB/wScAvw+sL5rXw+8edS1Tqr7vwHvAT7YjY91vV1dm4Ff64YfDTxhXOumd2OKu4Aju/EtwK+OY73AM4GnA7f2tU1ZZ/e3fQtwOHBS9xldMgb1PhdY2g2/eZzq9Wdef/dD3S+My2djtvuZAdcS4Khu+DDgc8CZo9pmMMN92YDfk53AsZPaRlHHjPeRw9o2dp/ZrwFPGnYtzHI/3FLHvL5ZA3jznwF8tG98A7Bh1HXNoO5rgOfQu8HAsq5tGXDHqGvrq3EFcD3wrL6Nz9jW29X0+O4DkUntY1k3P7zD2zH0Tuz9IL2wN671ruThYWHKOid/DuldneEZo6530rRfAK4cp3r9mbff+9D3C+P42TjYfmZYtQCPAW6id/fKodcym33ZgOvYySOD9FDrYJb7yCH+jTwX+IcRvSez2g+31DHuXTumutXs8hHVMiNJVgKn0/sP/fiq2g3QPR43wtIm+yPgtcBDfW3jXC/0jkDtBf6i+xrvHUkey5jWXVVfBd4K3A3sBr5TVR9jTOudwnR1LoTP5UuBD3fDC6Fezdw4/D5H+tmY4X5moLV03SluBvYA11XVqGr5I2a+LxtkHQV8LMmN6d21cxR1zHYfOazP0lrgqm54qLU07IdnXce4B+lM0Ta2lxlJchTwPuBVVfXdUdcznSQvAPZU1Y2jrmWWltL7ivXtVXU68D16X8mMpa7P1Rp6Xw+dADw2yYtHW9W8GOvPZZI3AA8CV+5vmmK2salXszbOv8+B1zaL/cxAa6mqfVV1Gr0jwmckeeqwa2nYlw3yPTmrqp4OPA+4JMkzR1DHbPeRw/h7fTTwQuBvDjbrIGpp2A/Puo5xD9IzutXsOEhyGL2N25VV9f6u+b4ky7rpy+j95z4OzgJemGQncDXwrCR/xfjWu98uYFd35APgvfQ2GuNa97OBu6pqb1V9H3g/8DOMb72TTVfn2H4uk6wDXgD8cnXfyzHG9arJOPw+R/LZmOV+ZijvU1V9G/gkcN4Iapntvmxg70lV3ds97gE+AJwxgjpmu48cxt/I84Cbquq+bnzYtcx2PzzrOsY9SC+IW80mCfBOYHtVva1v0lZgXTe8jl6ftpGrqg1VtaKqVtJ7T/+uql7MmNa7X1V9Dbgnycld0znA7Yxv3XcDZyZ5TPc3cg6wnfGtd7Lp6twKrE1yeJKTgFXADSOo72GSnAe8DnhhVf1b36SxrFfNxmG/MPTPRsN+ZpC1TKS7Kk6SI+mFlS8Pu5aGfdlA6kjy2CSP2z9Mrw/urcOuo2EfOYxt44v4YbeO/escZi2z3Q/Pvo757lQ+3z/A8+mdnfwV4A2jrmeaGn+W3qH/
LwI3dz/PB/4DvZMg7uwejxl1rVPUfjY/PEFjIdR7GrCte6//D3D0ONcN/C69HcytwF/SOxN47Oqlt6HbDXyf3n/kFx2oTuAN3WfyDuB5Y1LvDnp92/Z/Bv9sXOr1Z95//0PbL4zLZ6NlPzPAWn4S+EJXy63A73TtI9tmzHRfNog66PVNvqX7uW3/3+SIfjenMYt95CB/L/RORP0G8CN9baN4T2a1H55tHd7ZUJIkSWow7l07JEmSpLFkkJYkSZIaGKQlSZKkBgZpSZIkqYFBWpIkSWpgkJYkSZIaGKQlSZKkBgZpSZIkqcH/B+oQYT7AMtHXAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Inspect the trace-length and total_time_true distributions of the loaded log.\n",
    "display_dataset(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "820\n"
     ]
    }
   ],
   "source": [
    "# Keep the cases whose total_time_true (first row) lies in [1, 30] and whose\n",
    "# trace length lies in [15, 50]. groupby walks each case once instead of\n",
    "# re-scanning the whole frame per case id, and yields the ids in a\n",
    "# deterministic (first-appearance) order.\n",
    "MIN_TOTAL_TIME, MAX_TOTAL_TIME = 1, 30\n",
    "MIN_TRACE_LEN, MAX_TRACE_LEN = 15, 50\n",
    "\n",
    "filtered_cids = []\n",
    "for cid, this_df in df.groupby(\"case:concept:name\", sort=False):\n",
    "    true_value = this_df.iloc[0]['total_time_true']\n",
    "    time_ok = MIN_TOTAL_TIME <= true_value <= MAX_TOTAL_TIME\n",
    "    length_ok = MIN_TRACE_LEN <= this_df.shape[0] <= MAX_TRACE_LEN\n",
    "    if time_ok and length_ok:\n",
    "        filtered_cids.append(cid)\n",
    "print(len(filtered_cids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200 50\n",
      "(5270, 11)\n"
     ]
    }
   ],
   "source": [
    "# Seed the sampler so the 200/50 train/test case split can be reproduced\n",
    "# when the notebook is re-run.\n",
    "random.seed(0)\n",
    "df_samples, train_cids, test_cids = sample_data(df, filtered_cids, 200, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import Variables\n",
    "\n",
    "# Persist the sampled events and the train/test case-id split under the\n",
    "# DATA_PATH configured for the bpic2013 dataset.\n",
    "# NOTE(review): a later cell obtains DATA_PATH via `from config import\n",
    "# DRLParameters` instead of `Variables.DeepReinforceLearningParameters` -\n",
    "# confirm which import is current.\n",
    "save_path = Variables.DeepReinforceLearningParameters(dataset_name=\"bpic2013\").DATA_PATH\n",
    "\n",
    "# NOTE(review): unlike the index=False export further down, this to_csv call\n",
    "# also writes the DataFrame index as an extra column - confirm that is intended.\n",
    "df_samples.to_csv(save_path+\"/bpic2013_sample250.csv\")\n",
    "np.save(save_path+\"/train_case_ids.npy\", np.array(train_cids))\n",
    "np.save(save_path+\"/test_case_ids.npy\", np.array(test_cids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 2: 读取case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Reload a previously stored train/test case-id split.\n",
    "# NOTE(review): these hard-coded paths are presumably the same location as\n",
    "# the configured DATA_PATH used when saving - confirm.\n",
    "train_cids = np.load(\"../data/drl_data/bpic2013/train_case_ids.npy\")\n",
    "test_cids = np.load(\"../data/drl_data/bpic2013/test_case_ids.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5270, 11)\n"
     ]
    }
   ],
   "source": [
    "# Select every event whose case id belongs to the stored train or test\n",
    "# split. A single vectorised isin() filter replaces a per-case\n",
    "# DataFrame.append loop (quadratic, and DataFrame.append was removed in\n",
    "# pandas 2.0). Rows keep the order they have in `df`.\n",
    "selected_cids = np.concatenate([train_cids, test_cids])\n",
    "df_sample = df[df[\"case:concept:name\"].isin(selected_cids)]\n",
    "print(df_sample.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DRLParameters\n",
    "\n",
    "# Write the selected events (without the index column) under the DATA_PATH\n",
    "# configured for the bpic2013 dataset.\n",
    "save_path = DRLParameters(dataset_name=\"bpic2013\").DATA_PATH\n",
    "\n",
    "df_sample.to_csv(save_path+\"/bpic2013_nn_sample250.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EnvironmentalPermit数据预处理\n",
    "###### (已废弃)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b1d7efd0b0544d018d3ae8df9304440e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=937.0), HTML(valu…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0\n",
      "10000\n",
      "20000\n",
      "30000\n",
      "(38925, 9)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import EnvironmentalPermitPreprocess\n",
    "\n",
    "# Run the project's EnvironmentalPermit preprocessing (this section is marked\n",
    "# as deprecated in the heading above). NOTE: this rebinds `df`.\n",
    "df = EnvironmentalPermitPreprocess().preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  651 , test_data_size:  280\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed):\n",
    "    all_case_ids = set(data_processed['case:concept:name'].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    train_cids = random.sample(all_case_ids, num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    resource_names = np.array(data_processed['org:resource'].values)\n",
    "    resource_names = np.reshape(resource_names, (resource_names.shape[0], 1))\n",
    "    ohe_rn = OneHotEncoder(sparse=False)\n",
    "    ohe_rn.fit(resource_names)\n",
    "    \n",
    "    status_names = np.array(data_processed['case:caseStatus'].values)\n",
    "    status_names = np.reshape(status_names, (status_names.shape[0], 1))\n",
    "    ohe_sn = OneHotEncoder(sparse=False)\n",
    "    ohe_sn.fit(status_names)\n",
    "    \n",
    "    activity_names = np.array(data_processed['concept:name'].values)\n",
    "    activity_names = np.reshape(activity_names, (activity_names.shape[0], 1))\n",
    "    ohe_act = OneHotEncoder(sparse=False)\n",
    "    ohe_act.fit(activity_names)\n",
    "    \n",
    "    includes_subcases_names = np.array(data_processed['case:Includes_subCases'].values)\n",
    "    includes_subcases_names = np.reshape(includes_subcases_names, (includes_subcases_names.shape[0], 1))\n",
    "    ohe_isn = OneHotEncoder(sparse=False)\n",
    "    ohe_isn.fit(includes_subcases_names)\n",
    "\n",
    "    request_complete_names = np.array(data_processed['case:requestComplete'].values)\n",
    "    request_complete_names = np.reshape(request_complete_names, (request_complete_names.shape[0], 1))\n",
    "    ohe_rcn = OneHotEncoder(sparse=False)\n",
    "    ohe_rcn.fit(request_complete_names)\n",
    "    \n",
    "    last_phase_names = np.array(data_processed['case:last_phase'].values)\n",
    "    last_phase_names = np.reshape(last_phase_names, (last_phase_names.shape[0], 1))\n",
    "    ohe_lpn = OneHotEncoder(sparse=False)\n",
    "    ohe_lpn.fit(last_phase_names)\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        for cid in cid_set:\n",
    "            thisdf = data_processed[data_processed['case:concept:name'] == cid]\n",
    "            trace_length = max(trace_length, thisdf.shape[0])\n",
    "            tmpdata = []\n",
    "            \n",
    "            start_time = datetime.datetime.strptime(thisdf.iloc[0]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            end_time = datetime.datetime.strptime(thisdf.iloc[-1]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            last_time = start_time\n",
    "\n",
    "            for i in range(thisdf.shape[0]):\n",
    "                row = [int(thisdf.iloc[i]['case:concept:name'])]\n",
    "                \n",
    "                event_dt = datetime.datetime.strptime(thisdf.iloc[i]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "                \n",
    "                resource_name = np.array(thisdf.iloc[i]['org:resource'])\n",
    "                resource_name = np.reshape(resource_name, (-1, 1))\n",
    "                \n",
    "                status_name = np.array(thisdf.iloc[i]['case:caseStatus'])\n",
    "                status_name = np.reshape(status_name, (-1, 1))\n",
    "                \n",
    "                includes_subcases_name = np.array(thisdf.iloc[i]['case:Includes_subCases'])\n",
    "                includes_subcases_name = np.reshape(includes_subcases_name, (-1, 1))\n",
    "                \n",
    "                activity_name = np.array(thisdf.iloc[i]['concept:name'])\n",
    "                activity_name = np.reshape(activity_name, (-1, 1))\n",
    "                \n",
    "                request_complete_name = np.array(thisdf.iloc[i]['case:requestComplete'])\n",
    "                request_complete_name = np.reshape(request_complete_name, (-1, 1))\n",
    "                \n",
    "                last_phase_name = np.array(thisdf.iloc[i]['case:last_phase'])\n",
    "                last_phase_name = np.reshape(last_phase_name, (-1, 1))\n",
    "\n",
    "                row.extend(ohe_rn.transform(resource_name).tolist()[0])\n",
    "                row.extend(ohe_sn.transform(status_name).tolist()[0])\n",
    "                row.extend(ohe_isn.transform(includes_subcases_name).tolist()[0])\n",
    "                row.extend(ohe_act.transform(activity_name).tolist()[0])  # 活动名称one-hot编码\n",
    "                row.extend(ohe_rcn.transform(request_complete_name).tolist()[0])\n",
    "                row.extend(ohe_lpn.transform(last_phase_name).tolist()[0])\n",
    "                \n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # 总花费时间\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # 相比上次活动花费时间\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # 距午夜时间\n",
    "                \n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                if i != thisdf.shape[0] - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, max_train_trace_length = generate_data(train_cids_set)\n",
    "    test_data_set, max_test_trace_length = generate_data(test_cids_set)\n",
    "\n",
    "    min_value = [1e20] * (len(train_data_set[0][0]) - 1)\n",
    "    max_value = [-1] * (len(train_data_set[0][0]) - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row)-1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "    train_data_new = []\n",
    "    for i in range(len(train_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(train_data_set[i])):\n",
    "            row = [train_data_set[i][j][0]]\n",
    "            for k in range(1, len(train_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(train_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((train_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(train_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        train_data_new.append(seq)\n",
    "    test_data_new = []\n",
    "    for i in range(len(test_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(test_data_set[i])):\n",
    "            row = [test_data_set[i][j][0]]\n",
    "            for k in range(1, len(test_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(test_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((test_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(test_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        test_data_new.append(seq)\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(train_data_new, maxlen=20, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(test_data_new, maxlen=20, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(\"../data/filtered_data/Environmental Permit/train_data.npy\", train_data_set)\n",
    "np.save(\"../data/filtered_data/Environmental Permit/test_data.npy\", test_data_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hospital Billing数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "36ba61b5ec414df59bf022fca62c7947",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=100000.0), HTML(v…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "data_path = \"../data/raw_data/HospitalBilling/12705113/Hospital Billing - Event Log.xes\"\n",
    "\n",
    "import pm4py\n",
    "\n",
    "log = pm4py.read_xes(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pm4py.convert_to_dataframe(log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>isCancelled</th>\n",
       "      <th>diagnosis</th>\n",
       "      <th>time:timestamp</th>\n",
       "      <th>caseType</th>\n",
       "      <th>speciality</th>\n",
       "      <th>org:resource</th>\n",
       "      <th>concept:name</th>\n",
       "      <th>blocked</th>\n",
       "      <th>isClosed</th>\n",
       "      <th>flagD</th>\n",
       "      <th>...</th>\n",
       "      <th>lifecycle:transition</th>\n",
       "      <th>case:concept:name</th>\n",
       "      <th>closeCode</th>\n",
       "      <th>actRed</th>\n",
       "      <th>actOrange</th>\n",
       "      <th>flagC</th>\n",
       "      <th>msgCount</th>\n",
       "      <th>version</th>\n",
       "      <th>msgType</th>\n",
       "      <th>msgCode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>A</td>\n",
       "      <td>2012-12-16 19:33:10+01:00</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>ResA</td>\n",
       "      <td>NEW</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013-12-15 19:00:37+01:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FIN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013-12-16 03:53:38+01:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RELEASE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013-12-17 12:56:29+01:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CODE OK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013-12-19 03:44:31+01:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ResB</td>\n",
       "      <td>BILLED</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>A</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451354</th>\n",
       "      <td>False</td>\n",
       "      <td>OM</td>\n",
       "      <td>2015-12-13 19:31:23+01:00</td>\n",
       "      <td>A</td>\n",
       "      <td>E</td>\n",
       "      <td>ResA</td>\n",
       "      <td>NEW</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>AXQE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451355</th>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2016-01-14 21:17:47+01:00</td>\n",
       "      <td>B</td>\n",
       "      <td>L</td>\n",
       "      <td>ResDJ</td>\n",
       "      <td>NEW</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>BXQE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451356</th>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2016-01-14 22:00:13+01:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ResFR</td>\n",
       "      <td>DELETE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>BXQE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451357</th>\n",
       "      <td>False</td>\n",
       "      <td>LL</td>\n",
       "      <td>2016-01-11 19:40:47+01:00</td>\n",
       "      <td>A</td>\n",
       "      <td>D</td>\n",
       "      <td>ResA</td>\n",
       "      <td>NEW</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>CXQE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451358</th>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2016-01-13 22:48:13+01:00</td>\n",
       "      <td>I</td>\n",
       "      <td>K</td>\n",
       "      <td>ResJA</td>\n",
       "      <td>NEW</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>complete</td>\n",
       "      <td>DXQE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>451359 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       isCancelled diagnosis             time:timestamp caseType speciality  \\\n",
       "0            False         A  2012-12-16 19:33:10+01:00        A          A   \n",
       "1              NaN       NaN  2013-12-15 19:00:37+01:00      NaN        NaN   \n",
       "2              NaN       NaN  2013-12-16 03:53:38+01:00      NaN        NaN   \n",
       "3              NaN       NaN  2013-12-17 12:56:29+01:00      NaN        NaN   \n",
       "4              NaN       NaN  2013-12-19 03:44:31+01:00      NaN        NaN   \n",
       "...            ...       ...                        ...      ...        ...   \n",
       "451354       False        OM  2015-12-13 19:31:23+01:00        A          E   \n",
       "451355       False       NaN  2016-01-14 21:17:47+01:00        B          L   \n",
       "451356        True       NaN  2016-01-14 22:00:13+01:00      NaN        NaN   \n",
       "451357       False        LL  2016-01-11 19:40:47+01:00        A          D   \n",
       "451358       False       NaN  2016-01-13 22:48:13+01:00        I          K   \n",
       "\n",
       "       org:resource concept:name blocked isClosed  flagD  ...  \\\n",
       "0              ResA          NEW   False     True   True  ...   \n",
       "1               NaN          FIN     NaN      NaN    NaN  ...   \n",
       "2               NaN      RELEASE     NaN      NaN    NaN  ...   \n",
       "3               NaN      CODE OK     NaN      NaN    NaN  ...   \n",
       "4              ResB       BILLED     NaN      NaN    NaN  ...   \n",
       "...             ...          ...     ...      ...    ...  ...   \n",
       "451354         ResA          NEW   False    False  False  ...   \n",
       "451355        ResDJ          NEW   False    False  False  ...   \n",
       "451356        ResFR       DELETE     NaN      NaN    NaN  ...   \n",
       "451357         ResA          NEW   False    False  False  ...   \n",
       "451358        ResJA          NEW   False     True  False  ...   \n",
       "\n",
       "       lifecycle:transition case:concept:name closeCode actRed actOrange  \\\n",
       "0                  complete                 A       NaN    NaN       NaN   \n",
       "1                  complete                 A         A    NaN       NaN   \n",
       "2                  complete                 A       NaN    NaN       NaN   \n",
       "3                  complete                 A       NaN  False     False   \n",
       "4                  complete                 A       NaN    NaN       NaN   \n",
       "...                     ...               ...       ...    ...       ...   \n",
       "451354             complete              AXQE       NaN    NaN       NaN   \n",
       "451355             complete              BXQE       NaN    NaN       NaN   \n",
       "451356             complete              BXQE       NaN    NaN       NaN   \n",
       "451357             complete              CXQE       NaN    NaN       NaN   \n",
       "451358             complete              DXQE       NaN    NaN       NaN   \n",
       "\n",
       "        flagC msgCount version msgType  msgCode  \n",
       "0         NaN      NaN     NaN     NaN      NaN  \n",
       "1         NaN      NaN     NaN     NaN      NaN  \n",
       "2         NaN      NaN     NaN     NaN      NaN  \n",
       "3       False      0.0       A     NaN      NaN  \n",
       "4         NaN      NaN     NaN     NaN      NaN  \n",
       "...       ...      ...     ...     ...      ...  \n",
       "451354    NaN      NaN     NaN     NaN      NaN  \n",
       "451355    NaN      NaN     NaN     NaN      NaN  \n",
       "451356    NaN      NaN     NaN     NaN      NaN  \n",
       "451357    NaN      NaN     NaN     NaN      NaN  \n",
       "451358    NaN      NaN     NaN     NaN      NaN  \n",
       "\n",
       "[451359 rows x 23 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000 18\n",
      "10000 37\n",
      "15000 48\n",
      "20000 67\n",
      "25000 79\n",
      "30000 99\n",
      "35000 113\n",
      "40000 136\n",
      "45000 146\n",
      "50000 166\n",
      "55000 185\n",
      "60000 197\n",
      "65000 212\n",
      "70000 225\n",
      "75000 241\n",
      "80000 265\n",
      "85000 281\n",
      "90000 304\n",
      "95000 320\n",
      "100000 339\n"
     ]
    }
   ],
   "source": [
    "valid_cnts = []\n",
    "cid_cnt = 0\n",
    "for cid in set(df[\"case:concept:name\"]):\n",
    "    cid_cnt += 1\n",
    "    if cid_cnt % 5000 == 0:\n",
    "        print(cid_cnt, len(valid_cnts))\n",
    "    tmp_df = df[df[\"case:concept:name\"] == cid]\n",
    "    if tmp_df.shape[0] >= 15:\n",
    "        valid_cnts.append(cid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 LJJ\n",
      "1 DHM\n",
      "2 NCTB\n",
      "3 CDJA\n",
      "4 JBFC\n",
      "5 QPA\n",
      "6 XBHC\n",
      "7 IOTD\n",
      "8 XKYA\n",
      "9 MUE\n",
      "10 XJY\n",
      "11 FYH\n",
      "12 OMXB\n",
      "13 UIS\n",
      "14 VMC\n",
      "15 YQP\n",
      "16 SWCB\n",
      "17 WIU\n",
      "18 IARA\n",
      "19 OTYC\n",
      "20 ELKA\n",
      "21 LCXB\n",
      "22 PVCC\n",
      "23 UVCB\n",
      "24 JRFD\n",
      "25 FJDE\n",
      "26 RVAA\n",
      "27 ATH\n",
      "28 QWYB\n",
      "29 RHFB\n",
      "30 DQL\n",
      "31 HAQA\n",
      "32 RTZB\n",
      "33 ZGLB\n",
      "34 ZBWA\n",
      "35 AGND\n",
      "36 WYBB\n",
      "37 MJJB\n",
      "38 WREA\n",
      "39 MBDC\n",
      "40 YIZC\n",
      "41 FRAD\n",
      "42 WFK\n",
      "43 WHC\n",
      "44 GVJ\n",
      "45 FHKC\n",
      "46 QTNA\n",
      "47 JNPA\n",
      "48 LRVA\n",
      "49 BBH\n",
      "50 LLR\n",
      "51 ERY\n",
      "52 IRIC\n",
      "53 EFZD\n",
      "54 SUC\n",
      "55 AI\n",
      "56 YNBA\n",
      "57 TXOC\n",
      "58 AYG\n",
      "59 EOYB\n",
      "60 GIMA\n",
      "61 IUUB\n",
      "62 WLSB\n",
      "63 JOUA\n",
      "64 OTMD\n",
      "65 QZAB\n",
      "66 MIUA\n",
      "67 BOP\n",
      "68 KWUA\n",
      "69 XYJB\n",
      "70 TMOC\n",
      "71 WQY\n",
      "72 UCRA\n",
      "73 QAPA\n",
      "74 PHOB\n",
      "75 AHCD\n",
      "76 WMMB\n",
      "77 CBWD\n",
      "78 PFUA\n",
      "79 FIL\n",
      "80 MBL\n",
      "81 NLBD\n",
      "82 WMXA\n",
      "83 NLGC\n",
      "84 BUCA\n",
      "85 FZXC\n",
      "86 XUC\n",
      "87 KUM\n",
      "88 BQL\n",
      "89 UIEB\n",
      "90 UVNB\n",
      "91 JMOC\n",
      "92 PW\n",
      "93 GCB\n",
      "94 PBE\n",
      "95 RFKB\n",
      "96 UFWA\n",
      "97 ZJCC\n",
      "98 FUHD\n",
      "99 EWHA\n",
      "100 DDZ\n",
      "101 DI\n",
      "102 OYCC\n",
      "103 YFCB\n",
      "104 MSAE\n",
      "105 UVP\n",
      "106 RHWC\n",
      "107 NQH\n",
      "108 UZBD\n",
      "109 VOXA\n",
      "110 MWMC\n",
      "111 LFE\n",
      "112 IWPB\n",
      "113 HNGB\n",
      "114 QYQ\n",
      "115 ERNB\n",
      "116 XBDB\n",
      "117 OGL\n",
      "118 BVGA\n",
      "119 EQ\n",
      "120 IOEE\n",
      "121 UJMB\n",
      "122 OLF\n",
      "123 BPH\n",
      "124 MCDC\n",
      "125 FLM\n",
      "126 QHXB\n",
      "127 DTXB\n",
      "128 UGHC\n",
      "129 AORA\n",
      "130 FNGA\n",
      "131 JNS\n",
      "132 TRU\n",
      "133 FOLC\n",
      "134 DNDA\n",
      "135 PYRA\n",
      "136 MRYB\n",
      "137 AGU\n",
      "138 AOKB\n",
      "139 XFTA\n",
      "140 IUBD\n",
      "141 QKBC\n",
      "142 CEUB\n",
      "143 CYEC\n",
      "144 GMEB\n",
      "145 ESLA\n",
      "146 QCJ\n",
      "147 DDWA\n",
      "148 ZTKD\n",
      "149 KRJ\n",
      "150 VYOB\n",
      "151 SUJC\n",
      "152 NNOC\n",
      "153 GOCC\n",
      "154 ZMAD\n",
      "155 EXV\n",
      "156 ELJ\n",
      "157 GUKB\n",
      "158 KOD\n",
      "159 IGH\n",
      "160 GXYD\n",
      "161 NYFD\n",
      "162 GNTC\n",
      "163 LQUB\n",
      "164 AZGC\n",
      "165 EGXA\n",
      "166 CLWA\n",
      "167 JKIB\n",
      "168 BUJC\n",
      "169 PNJ\n",
      "170 KUSB\n",
      "171 FSX\n",
      "172 BLLE\n",
      "173 QMFD\n",
      "174 JUIB\n",
      "175 XYSC\n",
      "176 OPUB\n",
      "177 DCNC\n",
      "178 KPCC\n",
      "179 WTAC\n",
      "180 GQOA\n",
      "181 UGF\n",
      "182 LCBB\n",
      "183 EMC\n",
      "184 LVGB\n",
      "185 ULHA\n",
      "186 CFAE\n",
      "187 ANAC\n",
      "188 NOAA\n",
      "189 BTH\n",
      "190 LWRB\n",
      "191 YVMD\n",
      "192 LZAD\n",
      "193 WKSA\n",
      "194 KBDB\n",
      "195 KYS\n",
      "196 TFWB\n",
      "197 AAX\n",
      "198 TJVB\n",
      "199 VIYD\n",
      "200 RUDA\n",
      "201 SEQ\n",
      "202 QBLC\n",
      "203 DYBA\n",
      "204 YBNA\n",
      "205 LOZ\n",
      "206 JYGD\n",
      "207 OTBD\n",
      "208 SWP\n",
      "209 RNGD\n",
      "210 OCIC\n",
      "211 LHH\n",
      "212 GODA\n",
      "213 FUQA\n",
      "214 LQFE\n",
      "215 OUXC\n",
      "216 VGH\n",
      "217 AQL\n",
      "218 EDLC\n",
      "219 OXYC\n",
      "220 MPJD\n",
      "221 LNYA\n",
      "222 MWZ\n",
      "223 AJQA\n",
      "224 MZHA\n",
      "225 BEV\n",
      "226 LSN\n",
      "227 QGLB\n",
      "228 ILV\n",
      "229 CNFB\n",
      "230 JBQA\n",
      "231 KOAD\n",
      "232 DQFA\n",
      "233 UJOA\n",
      "234 SQF\n",
      "235 WYCD\n",
      "236 FWYA\n",
      "237 EUZC\n",
      "238 BORA\n",
      "239 SMCA\n",
      "240 KANA\n",
      "241 OQEA\n",
      "242 QZM\n",
      "243 ZJEC\n",
      "244 DYBB\n",
      "245 IWBE\n",
      "246 APH\n",
      "247 RFX\n",
      "248 JAUB\n",
      "249 KVUC\n",
      "250 OHAC\n",
      "251 VTUB\n",
      "252 CIBB\n",
      "253 AHS\n",
      "254 BLP\n",
      "255 RDAC\n",
      "256 LFJ\n",
      "257 ZWAC\n",
      "258 AJXB\n",
      "259 KKOD\n",
      "260 WGP\n",
      "261 DPXB\n",
      "262 WZHB\n",
      "263 OLK\n",
      "264 MTCA\n",
      "265 AKN\n",
      "266 RVC\n",
      "267 WKH\n",
      "268 YXWC\n",
      "269 XGEC\n",
      "270 BEZ\n",
      "271 RMEA\n",
      "272 RV\n",
      "273 ZDTB\n",
      "274 NNHD\n",
      "275 PJYB\n",
      "276 SKO\n",
      "277 QXKB\n",
      "278 KCTC\n",
      "279 LCMB\n",
      "280 RHUC\n",
      "281 KWJA\n",
      "282 QUK\n",
      "283 EVR\n",
      "284 DXED\n",
      "285 OEKC\n",
      "286 YLKB\n",
      "287 RDB\n",
      "288 IPT\n",
      "289 IPL\n",
      "290 EHSA\n",
      "291 PNIB\n",
      "292 CWGA\n",
      "293 TABC\n",
      "294 PBB\n",
      "295 DVB\n",
      "296 FIRB\n",
      "297 YMXA\n",
      "298 BOSA\n",
      "299 SWUC\n",
      "300 WIK\n",
      "301 OUSB\n",
      "302 NMMC\n",
      "303 DJIA\n",
      "304 FYBD\n",
      "305 RYRA\n",
      "306 KNZ\n",
      "307 IKQC\n",
      "308 XQZ\n",
      "309 RJB\n",
      "310 FYRC\n",
      "311 JZUC\n",
      "312 MPV\n",
      "313 UWHA\n",
      "314 EMAC\n",
      "315 OSYB\n",
      "316 LQGC\n",
      "317 HMUB\n",
      "318 MKBB\n",
      "319 HIAA\n",
      "320 RHQ\n",
      "321 QAAC\n",
      "322 EYK\n",
      "323 FYQD\n",
      "324 GZUA\n",
      "325 EUDD\n",
      "326 URF\n",
      "327 RUTB\n",
      "328 JCTC\n",
      "329 MKKD\n",
      "330 NRHA\n",
      "331 WQTB\n",
      "332 PUQ\n",
      "333 JSKC\n",
      "334 NCK\n",
      "335 FAL\n",
      "336 QRRA\n",
      "337 IAYB\n",
      "338 JHJA\n",
      "(6753, 23)\n"
     ]
    }
   ],
   "source": [
    "df_filtered = pd.DataFrame()\n",
    "\n",
    "cnt = 0\n",
    "for cid in valid_cnts:\n",
    "    print(cnt, cid)\n",
    "    cnt += 1\n",
    "    this_df = df[df[\"case:concept:name\"] == cid]\n",
    "    df_filtered = pd.concat([df_filtered, this_df])\n",
    "print(df_filtered.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_f = df_filtered[[\"time:timestamp\", \"concept:name\", \"case:concept:name\"]]\n",
    "df_f.to_csv(\"../data/filtered_data/Hospital Billing/filtered_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "(6753, 3)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import HospitalBillingPreprocess\n",
    "\n",
    "# Run the project's Hospital Billing preprocessing and report the resulting shape.\n",
    "hospital_billing_preprocessor = HospitalBillingPreprocess()\n",
    "df = hospital_billing_preprocessor.preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  237 , test_data_size:  102\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed, max_len=20):\n",
    "    \"\"\"Build padded, min-max scaled prefix sequences from an event log.\n",
    "\n",
    "    Cases are split 70/30 into train and test with a fixed seed. For each\n",
    "    case, every prefix of its events except the complete trace becomes one\n",
    "    sample. Each event row is laid out as:\n",
    "    [case id, one-hot activity..., days since case start, days since the\n",
    "    previous event, days since midnight, days until the case's last event].\n",
    "    The columns between the case id and the final remaining-time value are\n",
    "    min-max scaled using bounds computed on the training prefixes only.\n",
    "\n",
    "    Args:\n",
    "        data_processed: DataFrame with the columns 'case:concept:name',\n",
    "            'concept:name' and 'time:timestamp_short' (strings formatted as\n",
    "            '%Y-%m-%d %H:%M:%S'). The first/last row of a case is used as its\n",
    "            start/end, so rows are assumed to be in chronological order\n",
    "            -- TODO confirm against the preprocessing step.\n",
    "        max_len: Value passed to pad_sequences as maxlen (default 20, the\n",
    "            value that used to be hard-coded).\n",
    "\n",
    "    Returns:\n",
    "        (train_data_set, test_data_set): float64 arrays produced by\n",
    "        pad_sequences.\n",
    "    \"\"\"\n",
    "    time_format = '%Y-%m-%d %H:%M:%S'\n",
    "\n",
    "    all_case_ids = set(data_processed['case:concept:name'].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    # random.sample() needs a sequence: passing a set is deprecated since\n",
    "    # Python 3.9 and raises TypeError from 3.11. Sorting also makes the split\n",
    "    # independent of set iteration order.\n",
    "    train_cids = random.sample(sorted(all_case_ids), num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    # Fit the activity encoder on the whole log so train and test share categories.\n",
    "    activity_names = np.array(data_processed['concept:name'].values)\n",
    "    activity_names = np.reshape(activity_names, (activity_names.shape[0], 1))\n",
    "    ohe_act = OneHotEncoder(sparse=False)\n",
    "    ohe_act.fit(activity_names)\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        \"\"\"Return (prefix samples, longest trace length) for the given case ids.\"\"\"\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        for cid in cid_set:\n",
    "            thisdf = data_processed[data_processed['case:concept:name'] == cid]\n",
    "            num_events = thisdf.shape[0]\n",
    "            trace_length = max(trace_length, num_events)\n",
    "\n",
    "            # Parse every timestamp of the case once instead of once per use.\n",
    "            event_times = [\n",
    "                datetime.datetime.strptime(stamp, time_format)\n",
    "                for stamp in thisdf['time:timestamp_short']\n",
    "            ]\n",
    "            start_time = event_times[0]\n",
    "            end_time = event_times[-1]\n",
    "            last_time = start_time\n",
    "\n",
    "            # One-hot encode all activities of the case in a single call.\n",
    "            activity_rows = ohe_act.transform(\n",
    "                np.reshape(thisdf['concept:name'].values, (-1, 1))\n",
    "            ).tolist()\n",
    "\n",
    "            case_id = int(cid)\n",
    "            tmpdata = []\n",
    "            for i, event_dt in enumerate(event_times):\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "\n",
    "                row = [case_id]\n",
    "                row.extend(activity_rows[i])  # one-hot encoding of the activity name\n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # time since case start\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # time since previous event\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # time since midnight\n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)  # time until the last event\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                # The complete trace is not emitted as a sample.\n",
    "                if i != num_events - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, _ = generate_data(train_cids_set)\n",
    "    test_data_set, _ = generate_data(test_cids_set)\n",
    "\n",
    "    # Per-column bounds from the training prefixes only. Index 0 (case id) and\n",
    "    # the last column (remaining time) are never scaled. Start from +/-inf so\n",
    "    # the bounds are right whatever the sign or magnitude of the data.\n",
    "    num_columns = len(train_data_set[0][0])\n",
    "    min_value = [float('inf')] * (num_columns - 1)\n",
    "    max_value = [float('-inf')] * (num_columns - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row)-1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "\n",
    "    def scale_features(data_set):\n",
    "        \"\"\"Min-max scale the feature columns of every prefix with the training bounds.\"\"\"\n",
    "        scaled_set = []\n",
    "        for prefix in data_set:\n",
    "            seq = []\n",
    "            for raw_row in prefix:\n",
    "                row = [raw_row[0]]  # case id is kept as-is\n",
    "                for k in range(1, len(raw_row)-1):\n",
    "                    if max_value[k] == min_value[k]:\n",
    "                        # Constant column in the training data: leave the value unscaled.\n",
    "                        row.append(raw_row[k])\n",
    "                    else:\n",
    "                        row.append((raw_row[k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "                row.append(raw_row[-1])  # remaining time is kept unscaled\n",
    "                seq.append(row)\n",
    "            scaled_set.append(seq)\n",
    "        return scaled_set\n",
    "\n",
    "    train_data_new = scale_features(train_data_set)\n",
    "    test_data_new = scale_features(test_data_set)\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(train_data_new, maxlen=max_len, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(test_data_new, maxlen=max_len, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the padded train/test arrays as .npy files.\n",
    "hospital_billing_arrays = {\n",
    "    \"../data/filtered_data/Hospital Billing/train_data.npy\": train_data_set,\n",
    "    \"../data/filtered_data/Hospital Billing/test_data.npy\": test_data_set,\n",
    "}\n",
    "for npy_path, split_array in hospital_billing_arrays.items():\n",
    "    np.save(npy_path, split_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   case:concept:name  concept:name time:timestamp_short  time_spent  \\\n",
      "0         1643520001           NEW  2013-02-10 22:03:38    0.000000   \n",
      "1         1643520001  CHANGE DIAGN  2013-02-10 23:56:55    0.078669   \n",
      "2         1643520001  CHANGE DIAGN  2013-03-25 11:12:00   42.547477   \n",
      "3         1643520001           FIN  2013-03-27 19:20:30   44.886713   \n",
      "4         1643520001       RELEASE  2013-03-28 02:38:12   45.190671   \n",
      "\n",
      "   total_time_pred  total_time_true  \n",
      "0       361.766907       266.592083  \n",
      "1       236.769221       266.592083  \n",
      "2       250.947028       266.592083  \n",
      "3       161.782457       266.592083  \n",
      "4       143.420919       266.592083  \n"
     ]
    }
   ],
   "source": [
    "# Load the Hospital Billing CSV at the path below and preview its first five rows.\n",
    "data_path = \"../data/drl_data/hospital_billing/hospital_billing_wp_nn.csv\"\n",
    "\n",
    "df = pd.read_csv(filepath_or_buffer=data_path)\n",
    "print(df.head(n=5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 1: 抽样case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "339 339\n",
      "15 217 0.02831018518518518 899.0078819444444\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAF1CAYAAADr3izzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAg8UlEQVR4nO3df5RkZX3n8ffHGQRRNkJo2JFBG82EE8g5AtuHmJAYIyqIypjdmDMkesjGZLJZzGpW1wyak+iekCWJmmTPibqjEllDwInKShyTlRBdwyaKDRmUYSCMMkLDONP4CzQuyvDdP+q2Kdr+UV0/u2ver3Pq1K3nPlX3+9yuuv3p209VpaqQJEmSDnePG3UBkiRJ0mpgMJYkSZIwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxxkSSfUmeN4LtTiapJOuHvW1J0sKSvDfJ74y6Dq09BmN1bVRhdJQOxzFLEnj80+HBYKyB8SyqJB0+POZrHBiM1ZUk7wOeCvxlkm8keX3btIJXJrkH+Num718k+VKSryf5ZJLT2x7nCUnemuSLzfobkzyhWfesJH+f5GtJbk3ynA5re1ySbUk+n+TLSXYkOa5ZN1fjxUnuSfJAkjfOq+fKJF9NsqcZ18xiY27b7M8v9HiSNA46PeYnec7cMbPtvt8907zU8XmBbe5J8uK22+ubY+xZze1Ff7fMe5xfSHLjvLZK8gPN8pFJ3tIcww8keWfb76Hjk3yk+T30lSR/l8TsNMb84aorVfUK4B7gJVX1pKr6/bbVPwn8EHBec/uvgE3ACcAtwFVtfd8C/Bvgx4DjgNcDjyY5CdgJ/E7T/jrgg0kmOijvPwEvbep4CvBV4E/m9flx4FTgXOC3kvxQ0/7bwCTwdOD5wMs7HPNijydJa94Kj/lL6eT4POdq4KK22+cBD1TVLc3tpX63rMTvAT8InAH8AHAS8FvNutcCM8AEcCLwBqC63I7WAIOxBuFNVfXNqvoWQFVdUVUPVdXDwJuAZyb5vuav7l8EXl1V91XVoar6+6bfy4GPVtVHq+rRqroemAYu6GD7vwK8sapm2rb5M/P+zffmqvpWVd0K3Ao8s2n/WeB3q+qrVTUD/PcOx7zY40nSuHvMMX8ZnRyf5/w5cGGSo5vbP9e0AYv/bllJ4UkC/DLw61X1lap6CPhdYEvT5TvABuBpVfWdqvq7qjIYjzHnA2kQ7p1bSLIOuAx4Ga2/uB9tVh0PHAkcBXx+gcd4GvCyJC9pazsC+HgH238acG2SR9vaDtH6a3/Ol9qW/xl4UrP8lPb65y0vZbHHk6Rx1+lxEpY+Pt/X3rGq9ibZA7wkyV8CFwJnwrK/W76+gnomgKOBm1sZGYAA65rlP6AVuj/WrN9eVZev4PG1xhiM1YvF/mpub/85YDPwPGAf8H20/nUW4AHg/wHPoHWWtd29wPuq6pe7qOte4Ber6v/OX5Fkcpn77gc2Arc3t0+et94zBZIOV50c879JK2gC3w2w7VPgFj0+L2JuOsXjgNuram/TvtTvlvnm1/Sv29Y9AHwLOL2q7pt/x+YM8muB1zZzmD+e5DNVdUOH9WuNcSqFenGA1lzcpRwDPAx8mdaB6XfnVlTVo8AVwNuSPCXJuiQ/muRI4M9onSU4r2k/qnlTx8YO6noncFmSpwEkmUiyucMx7QAuTXJsM8/5VfPWdzJmSRpHnRz//gk4KsmLkhwB/Cat/w7OWenx+RrgBcCv0jaNgiV+tyzgVuD0JGckOYrWGWDgu7+H3gX8YZITmppOSnJes/ziJD/QTLl4kNbZ7UPL7AOtYQZj9eK/Ab/ZvFv3dYv0+Z/AF2n9i+x24FPz1r8O+BzwGeArtN4E8biqupfW2YA3ALO0zjL8Fzp7zv4xcB2tf3091GzzRzoc03+l9UaLu4G/AT5A6+A7p5MxS9I4Wvb4V1VfB/4j8G5ax/1v0jqmzlnR8bmq9gP/QOsN2u9vW7Xc
75b2x/gnWsf2vwHuAm6c1+U3gL3Ap5I82PQ7tVm3qbn9jaaOt1fVJxbblta+OIdcWlySXwW2VNVPjroWSZI0WJ4xltok2ZDknOazNk+lNbfs2lHXJUmSBs8330mP9XjgfwCnAF+jNb/t7aMsSJIkDYdTKSRJkiScSiFJkiQBBmNJkiQJWCVzjI8//vianJwcdRmSDnM333zzA1U1sXxPdcNjvaTVYKlj/aoIxpOTk0xPT4+6DEmHuSRfHHUN48xjvaTVYKljvVMpJEmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliR1IcmpSXa1XR5M8ppR1yVJvVg/6gIkSWtPVd0JnAGQZB1wH3DtKGuSpF55xliS1Ktzgc9X1RdHXYgk9WJNnzGe3LZz2T77Ln/RECqRpMPaFuDqhVYk2QpsBXjqU586zJoOG5Pbdvq7TuoTzxhLkrqW5PHAhcBfLLS+qrZX1VRVTU1MTAy3OElaIYOxJKkXLwRuqaoDoy5Eknq1bDBOclSSm5LcmmR3kjc37ccluT7JXc31sW33uTTJ3iR3JjlvkAOQJI3URSwyjUKS1ppOzhg/DDy3qp5J6x3I5yd5FrANuKGqNgE3NLdJchqt+WanA+cDb2/esSxJGiNJjgaeD3xo1LVIUj8sG4yr5RvNzSOaSwGbgSub9iuBlzbLm4Frqurhqrob2Auc3c+iJUmjV1X/XFXfX1VfH3UtktQPHc0xTrIuyS7gIHB9VX0aOLGq9gM01yc03U8C7m27+0zTNv8xtyaZTjI9OzvbwxAkSZKk3nUUjKvqUFWdAWwEzk7yw0t0z0IPscBj+k5lSZIkrRor+lSKqvoa8Alac4cPJNkA0FwfbLrNACe33W0jcH+vhUqSJEmD1MmnUkwkeXKz/ATgecAdwHXAxU23i4EPN8vXAVuSHJnkFGATcFOf65YkSZL6qpNvvtsAXNl8ssTjgB1V9ZEk/wDsSPJK4B7gZQBVtTvJDuB24BHgkqo6NJjyJUmSpP5YNhhX1WeBMxdo/zJw7iL3uQy4rOfqJEmSpCHxm+8kSZIkDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEnqUpInJ/lAkjuS7Enyo6OuSZJ6sX7UBUiS1qw/Bv66qn4myeOBo0ddkCT1wmAsSVqxJP8KeDbwCwBV9W3g26OsSZJ65VQKSVI3ng7MAn+a5B+TvDvJE0ddlCT1wmAsSerGeuAs4B1VdSbwTWDb/E5JtiaZTjI9Ozs77Br7bnLbzlGXIGmADMaSpG7MADNV9enm9gdoBeXHqKrtVTVVVVMTExNDLVCSVspgLElasar6EnBvklObpnOB20dYkiT1zDffSZK69WvAVc0nUnwB+PcjrkeSemIwliR1pap2AVOjrkOS+sWpFJIkSRIGY0mSJAkwGEuSJEmAwViSJEkCOgjGSU5O8vEke5LsTvLqpv1NSe5Lsqu5XNB2n0uT7E1yZ5LzBjkASZIkqR86+VSKR4DXVtUtSY4Bbk5yfbPuD6vqLe2dk5wGbAFOB54C/E2SH6yqQ/0sXJIkSeqnZc8YV9X+qrqlWX4I2AOctMRdNgPXVNXDVXU3sBc4ux/FSpIkSYOyojnGSSaBM4G5rwB9VZLPJrkiybFN20nAvW13m2GBIJ1ka5LpJNOzs7Mrr1ySJEnqo46DcZInAR8EXlNVDwLvAJ4BnAHsB94613WBu9f3NFRtr6qpqpqamJhYad2SJElSX3UUjJMc
QSsUX1VVHwKoqgNVdaiqHgXexb9Ml5gBTm67+0bg/v6VLEmSJPVfJ59KEeA9wJ6qeltb+4a2bj8N3NYsXwdsSXJkklOATcBN/StZkiRJ6r9OPpXiHOAVwOeS7Gra3gBclOQMWtMk9gG/AlBVu5PsAG6n9YkWl/iJFJIkSVrtlg3GVXUjC88b/ugS97kMuKyHuiRJkqSh8pvvJEmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBsH7UBUiS1qYk+4CHgEPAI1U1NdqKJKk3BmNJUi9+qqoeGHURktQPTqWQJEmSMBhLkrpXwMeS3Jxk66iLkaReGYwlSd06p6rOAl4IXJLk2fM7JNmaZDrJ9Ozs7PArHLDJbTv72k/SaBmMJUldqar7m+uDwLXA2Qv02V5VU1U1NTExMewSJWlFDMaSpBVL8sQkx8wtAy8AbhttVZLUGz+VQpLUjROBa5NA63fJn1fVX4+2JEnqjcFYkrRiVfUF4JmjrkOS+smpFJIkSRIGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAnoIBgnOTnJx5PsSbI7yaub9uOSXJ/krub62Lb7XJpkb5I7k5w3yAFIkiRJ/dDJGeNHgNdW1Q8BzwIuSXIasA24oao2ATc0t2nWbQFOB84H3p5k3SCKlyRJkvpl2WBcVfur6pZm+SFgD3ASsBm4sul2JfDSZnkzcE1VPVxVdwN7gbP7XLckSZLUVyuaY5xkEjgT+DRwYlXth1Z4Bk5oup0E3Nt2t5mmTZIkSVq1Og7GSZ4EfBB4TVU9uFTXBdpqgcfbmmQ6yfTs7GynZUiSJEkD0VEwTnIErVB8VVV9qGk+kGRDs34DcLBpnwFObrv7RuD++Y9ZVduraqqqpiYmJrqtX5IkSeqLTj6VIsB7gD1V9ba2VdcBFzfLFwMfbmvfkuTIJKcAm4Cb+leyJEmS1H/rO+hzDvAK4HNJdjVtbwAuB3YkeSVwD/AygKranWQHcDutT7S4pKoO9btwSZIkqZ+WDcZVdSMLzxsGOHeR+1wGXNZDXZIkSdJQ+c13kiRJEgZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSVIPkqxL8o9JPjLqWiSpVwZjSVIvXg3sGXURktQPBmNJUleSbAReBLx71LVIUj8YjCVJ3foj4PXAo4t1SLI1yXSS6dnZ2a42MrltZ3fVdanT7S3Ub3Lbzse0L7bci9X2ONI4MRhLklYsyYuBg1V181L9qmp7VU1V1dTExMSQqpOk7hiMJUndOAe4MMk+4BrguUn+bLQlSVJvDMaSpBWrqkuramNVTQJbgL+tqpePuCxJ6onBWJIkSQLWj7oASdLaVlWfAD4x4jIkqWeeMZYkSZIwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEdBOMkVyQ5mOS2trY3Jbkvya7mckHbukuT7E1yZ5LzBlW4JEmS1E+dnDF+L3D+Au1/WFVnNJePAiQ5DdgCnN7c5+1J1vWrWEmSJGlQlg3GVfVJ4CsdPt5m4Jqqeriq7gb2Amf3UJ8kSZI0FL3MMX5Vks82Uy2ObdpOAu5t6zPTtEmSJEmrWrfB+B3AM4AzgP3AW5v2LNC3FnqAJFuT
TCeZnp2d7bIMSZIkqT+6CsZVdaCqDlXVo8C7+JfpEjPAyW1dNwL3L/IY26tqqqqmJiYmuilDkiRJ6puugnGSDW03fxqY+8SK64AtSY5McgqwCbiptxIlSZKkwVu/XIckVwPPAY5PMgP8NvCcJGfQmiaxD/gVgKranWQHcDvwCHBJVR0aSOWSJElSHy0bjKvqogWa37NE/8uAy3opSpIkSRo2v/lOkiRJwmAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSVIXkhyV5KYktybZneTNo65Jknq1ftQFSJLWpIeB51bVN5IcAdyY5K+q6lOjLkySumUwliStWFUV8I3m5hHNpUZXkST1zqkUkqSuJFmXZBdwELi+qj494pIkqScGY0lSV6rqUFWdAWwEzk7yw/P7JNmaZDrJ9Ozs7Iq3MbltZ++FdmFuu+3X7bXMr6vXOvsxzqXqG4SF9sH8/bZU/2620Y/HlJZiMJYk9aSqvgZ8Ajh/gXXbq2qqqqYmJiaGXZokrYjBWJK0Ykkmkjy5WX4C8DzgjpEWJUk98s13kqRubACuTLKO1kmWHVX1kRHXJEk9MRhLklasqj4LnDnqOiSpn5xKIUmSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJQAfBOMkVSQ4mua2t7bgk1ye5q7k+tm3dpUn2JrkzyXmDKlySJEnqp07OGL8XOH9e2zbghqraBNzQ3CbJacAW4PTmPm9Psq5v1UqSJEkDsmwwrqpPAl+Z17wZuLJZvhJ4aVv7NVX1cFXdDewFzu5PqZIkSdLgdDvH+MSq2g/QXJ/QtJ8E3NvWb6Zp+x5JtiaZTjI9OzvbZRmSJElSf/T7zXdZoK0W6lhV26tqqqqmJiYm+lyGJEmStDLdBuMDSTYANNcHm/YZ4OS2fhuB+7svT5IkSRqOboPxdcDFzfLFwIfb2rckOTLJKcAm4KbeSpQkSZIGb/1yHZJcDTwHOD7JDPDbwOXAjiSvBO4BXgZQVbuT7ABuBx4BLqmqQwOqXZIkSeqbZYNxVV20yKpzF+l/GXBZL0VJkiRJw+Y330mSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkrqQ5OQkH0+yJ8nuJK8edU2S1KtlP5VCkqQFPAK8tqpuSXIMcHOS66vq9lEXJknd8oyxJGnFqmp/Vd3SLD8E7AFOGm1VktQbg7EkqSdJJoEzgU8vsG5rkukk07Ozs0OvrVOT23Z2ta59fXu/xe6zUN/56+evW277S9W1XB2dPEY3213oerHHa++31Pr2PvP7LrdfO13X7b7u1+P2e/udbmuY213tDMaSpK4leRLwQeA1VfXg/PVVtb2qpqpqamJiYvgFStIKGIwlSV1JcgStUHxVVX1o1PVIUq8MxpKkFUsS4D3Anqp626jrkaR+MBhLkrpxDvAK4LlJdjWXC0ZdlCT1wo9rkyStWFXdCGTUdUhSP3nGWJIkScJgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCYH0vd06yD3gIOAQ8UlVTSY4D3g9MAvuAn62qr/ZWpiRJkjRY/Thj/FNVdUZVTTW3twE3VNUm4IbmtiRJkrSqDWIqxWbg
ymb5SuClA9iGJEmS1Fe9BuMCPpbk5iRbm7YTq2o/QHN9wkJ3TLI1yXSS6dnZ2R7LkCRJknrT0xxj4Jyquj/JCcD1Se7o9I5VtR3YDjA1NVU91iFJkiT1pKczxlV1f3N9ELgWOBs4kGQDQHN9sNciJUmSpEHrOhgneWKSY+aWgRcAtwHXARc33S4GPtxrkZKk1SfJFUkOJrlt1LVIUj/0csb4RODGJLcCNwE7q+qvgcuB5ye5C3h+c1uSNH7eC5w/6iIkqV+6nmNcVV8AnrlA+5eBc3spSpK0+lXVJ5NMjroOSeoXv/lOkjQw/fwEosltO797PXdZbF1720LLC/Wb/5jL1bGSepfqs9D6+W0L1dzJ/Ra6/2L7ZbFtLTemTvfHUnUu9xjL7cPFalpqLCvZxnLbW2z789cvtf+73d/d7P9O7t9pe6/bX+6xOvnZ95PBWJI0MFW1vaqmqmpqYmJi1OVI0pIMxpIkSRIGY0mSJAkwGEuSupTkauAfgFOTzCR55ahrkqRe9PrNd5Kkw1RVXTTqGiSpnzxjLEmSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiQA1o+6gEGb3LZz2T77Ln/RECqRJEnSauYZY0mSJAmDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJgPWjLmA1mNy2s6N++y5/0YArkSRJ0qgM7IxxkvOT3Jlkb5Jtg9qOJGk0PM5LGjcDOWOcZB3wJ8DzgRngM0muq6rbB7G9YenkzLJnlSUdDsb1OC/p8DaoM8ZnA3ur6gtV9W3gGmDzgLYlSRo+j/OSxs6g5hifBNzbdnsG+JEBbWtV6XS+cic8+7wya/mM/ijmuY/7/lqttY+Rw/Y4L2l8par6/6DJy4DzquqXmtuvAM6uql9r67MV2NrcPBW4s++FLOx44IEhbWs1bt8aVk8No96+NXxvDU+rqokR17ImdHKcb9p7PdavhufHsDjW8eRYV59Fj/WDOmM8A5zcdnsjcH97h6raDmwf0PYXlWS6qqaGvd3Vsn1rWD01jHr71rC6aliDlj3OQ+/H+sPpZ+NYx5NjXVsGNcf4M8CmJKckeTywBbhuQNuSJA2fx3lJY2cgZ4yr6pEkrwL+N7AOuKKqdg9iW5Kk4fM4L2kcDewLPqrqo8BHB/X4PRj69I1Vtn2whjmjrmHU2wdrmLMaalhzhnScP5x+No51PDnWNWQgb76TJEmS1pqBffOdJEmStJaMbTBOcnKSjyfZk2R3klc37W9Kcl+SXc3lggHXsS/J55ptTTdtxyW5PsldzfWxA9z+qW1j3ZXkwSSvGeR+SHJFkoNJbmtrW3TMSS5tvlL2ziTnDbCGP0hyR5LPJrk2yZOb9skk32rbF+8cYA2L7vch7of3t21/X5JdTXvf98MSr8OhPR+WqGGozwetTMbs66ZXw2th2JKsS/KPST7S3B7LsSZ5cpIPNMeTPUl+dIzH+uvN8/e2JFcnOWrsxlpVY3kBNgBnNcvHAP8EnAa8CXjdEOvYBxw/r+33gW3N8jbg94ZUyzrgS8DTBrkfgGcDZwG3LTfm5mdyK3AkcArweWDdgGp4AbC+Wf69thom2/sNeD8suN+HuR/mrX8r8FuD2g9LvA6H9nxYooahPh+8rOhntq752T8deHzznDht1HX1OKaRvxZGMOb/DPw58JHm9liOFbgS+KVm+fHAk8dxrLS+1Odu4AnN7R3AL4zbWMf2jHFV7a+qW5rlh4A9tH6oq8FmWi8kmuuXDmm75wKfr6ovDnIjVfVJ4Cvzmhcb82bgmqp6uKruBvbS+qrZvtdQ
VR+rqkeam5+i9bmrA7PIfljM0PbDnCQBfha4utftLLH9xV6HQ3s+LFbDsJ8PWpGx+7rp1fBaGKYkG4EXAe9uax67sSb5V7ROQLwHoKq+XVVfYwzH2lgPPCHJeuBoWp9dPlZjHdtg3C7JJHAm8Omm6VXNv0+vyACnMTQK+FiSm9P6BiiAE6tqP7QOlsAJA65hzhYeG4KGuR8WG/NCXys7jD9gfhH4q7bbpzT/8vs/SX5iwNteaL+PYj/8BHCgqu5qaxvYfpj3OhzJ82GBY8GcUT4f9L1GdVwYitXwWhiCPwJeDzza1jaOY306MAv8aXPMeHeSJzKGY62q+4C3APcA+4GvV9XHGLOxjn0wTvIk4IPAa6rqQeAdwDOAM2j9YN864BLOqaqzgBcClyR59oC3t6C0PoD/QuAvmqZh74fFZIG2gX5USpI3Ao8AVzVN+4GnVtWZNP/6a84CDMJi+33o+wG4iMf+oTSw/bDA63DRrgu09WU/LFbDiJ8PWtgoXg9DsRpeC4OW5MXAwaq6udO7LNC2JsZK6wzqWcA7mmPGN2lNJ1jMmh1rcyJnM61pEU8Bnpjk5UvdZYG2VT/WsQ7GSY6gdQC6qqo+BFBVB6rqUFU9CryLAZ/Wr6r7m+uDwLXN9g4k2dDUuAE4OMgaGi8EbqmqA009Q90PLD7mjr5Wtl+SXAy8GPj5aiZBNf/m+XKzfDOteVA/OIjtL7Hfh70f1gP/Fnh/W20D2Q8LvQ4Z8vNhkRpG/nzQoob6ehiW1fBaGJJzgAuT7KM1Dea5Sf6M8RzrDDBTVXP/hfoAraA8jmN9HnB3Vc1W1XeADwE/xpiNdWyDcTN/8j3Anqp6W1v7hrZuPw3cNv++fazhiUmOmVum9Waf22h9berFTbeLgQ8PqoY2jzk7OMz90FhszNcBW5IcmeQUYBNw0yAKSHI+8BvAhVX1z23tE0nWNctPb2r4woBqWGy/D20/NJ4H3FFVM2219X0/LPY6ZIjPhyWOBSN/PmhRY/d106vhtTAsVXVpVW2sqklaP7u/raqXM55j/RJwb5JTm6ZzgdsZw7HSmkLxrCRHN8/nc2nNlR+vsfbjHXyr8QL8OK1T9p8FdjWXC4D3AZ9r2q8DNgywhqfTekfmrcBu4I1N+/cDNwB3NdfHDXhfHA18Gfi+traB7QdaAXw/8B1afzG+cqkxA2+kdVbuTuCFA6xhL635TnPPh3c2ff9d8/O5FbgFeMkAa1h0vw9rPzTt7wX+w7y+fd8PS7wOh/Z8WKKGoT4fvKz453YBrU9u+DzNsXMtX1bDa2FE434O//KpFGM5VlpT46abn+3/Ao4d47G+GbiD1kmd99H6xImxGqvffCdJkiQxxlMpJEmSpJUwGEuSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEgD/H17x9KwszfmdAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Print per-case trace-length and total_time_true statistics for the loaded log.\n",
    "display_dataset(data_frame=df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "324\n",
      "200 50\n",
      "(4432, 6)\n"
     ]
    }
   ],
   "source": [
    "# Keep cases whose total_time_true is at least 30 and that have 15 to 50 events.\n",
    "filtered_cids = []\n",
    "for cid in list(set(df[\"case:concept:name\"].values)):\n",
    "    this_df = df[df[\"case:concept:name\"]==cid]\n",
    "    true_value = this_df.iloc[0]['total_time_true']\n",
    "    # Written as a negated >= so a NaN value is rejected, exactly like `if true_value >= 30`.\n",
    "    if not true_value >= 30:\n",
    "        continue\n",
    "    num_events = this_df.shape[0]\n",
    "    if 15 <= num_events <= 50:\n",
    "        filtered_cids.append(cid)\n",
    "print(len(filtered_cids))\n",
    "\n",
    "# Draw the sample; 200 and 50 are passed straight through to sample_data.\n",
    "df_samples, train_cids, test_cids = sample_data(df, filtered_cids, 200, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR:root:[Errno 2] No such file or directory: '/Users/wangnaixuan/Documents/GitHub/prediction_service/data/drl_data/hospital_billing/hospital_billing_sample250.csv': read sample file failed.\n"
     ]
    }
   ],
   "source": [
    "from config import DeepReinforceLearningParameters\n",
    "\n",
    "# Write the sampled log and both case-id arrays under the dataset's DATA_PATH.\n",
    "drl_params = DeepReinforceLearningParameters(dataset_name=\"hospital_billing\")\n",
    "save_path = drl_params.DATA_PATH\n",
    "\n",
    "df_samples.to_csv(save_path+\"/hospital_billing_sample250.csv\", index=False)\n",
    "for ids_file, case_ids in ((\"/train_case_ids.npy\", train_cids), (\"/test_case_ids.npy\", test_cids)):\n",
    "    np.save(save_path+ids_file, np.array(case_ids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 2: 读取case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Reload the train/test case ids from their .npy files.\n",
    "train_ids_path = \"../data/drl_data/hospital_billing/train_case_ids.npy\"\n",
    "test_ids_path = \"../data/drl_data/hospital_billing/test_case_ids.npy\"\n",
    "train_cids = np.load(train_ids_path)\n",
    "test_cids = np.load(test_ids_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4432, 6)\n"
     ]
    }
   ],
   "source": [
    "# Gather the events of every selected case and concatenate once.\n",
    "# DataFrame.append was removed in pandas 2.0, and `cid in <ndarray>` rescans the\n",
    "# whole id array for each case, so use a set of ids and a single pd.concat.\n",
    "selected_cids = set(np.asarray(train_cids).tolist()) | set(np.asarray(test_cids).tolist())\n",
    "case_frames = [\n",
    "    df[df[\"case:concept:name\"] == cid]\n",
    "    for cid in set(df[\"case:concept:name\"])\n",
    "    if cid in selected_cids\n",
    "]\n",
    "# pd.concat raises on an empty list, so keep the empty-frame result in that case.\n",
    "df_sample = pd.concat(case_frames) if case_frames else pd.DataFrame()\n",
    "print(df_sample.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DRLParameters\n",
    "\n",
    "# Export the selected sample as CSV under the dataset's DATA_PATH.\n",
    "drl_params = DRLParameters(dataset_name=\"hospital_billing\")\n",
    "save_path = drl_params.DATA_PATH\n",
    "sample_csv_path = save_path+\"/hospital_billing_nn_sample250.csv\"\n",
    "\n",
    "df_sample.to_csv(sample_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BPIC 2015数据集处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3f011508a919426f81e762461f2bfdf8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=1199.0), HTML(val…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0\n",
      "1000\n",
      "0\n",
      "10000\n",
      "20000\n",
      "30000\n",
      "40000\n",
      "50000\n",
      "(52196, 8)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import BPIC2015Preprocess\n",
    "\n",
    "# Run the project's BPIC 2015 preprocessing and report the resulting shape.\n",
    "bpic2015_preprocessor = BPIC2015Preprocess()\n",
    "df = bpic2015_preprocessor.preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  834 , test_data_size:  358\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed, max_len=20):\n",
    "    \"\"\"Build padded, min-max scaled prefix sequences from the BPIC 2015 log.\n",
    "\n",
    "    Cases are split 70/30 into train and test with a fixed seed. For each\n",
    "    case, every prefix of its events except the complete trace becomes one\n",
    "    sample. Each event row is laid out as:\n",
    "    [case id, one-hot blocks for 'org:resource', 'case:caseStatus',\n",
    "    'monitoringResource', 'concept:name', 'case:requestComplete' and\n",
    "    'case:last_phase' (in that order), days since case start, days since the\n",
    "    previous event, days since midnight, days until the case's last event].\n",
    "    The columns between the case id and the final remaining-time value are\n",
    "    min-max scaled using bounds computed on the training prefixes only.\n",
    "\n",
    "    Args:\n",
    "        data_processed: DataFrame with 'case:concept:name', the six\n",
    "            categorical columns listed above and 'time:timestamp_short'\n",
    "            (strings formatted as '%Y-%m-%d %H:%M:%S'). The first/last row of\n",
    "            a case is used as its start/end, so rows are assumed to be in\n",
    "            chronological order -- TODO confirm against the preprocessing step.\n",
    "        max_len: Value passed to pad_sequences as maxlen (default 20, the\n",
    "            value that used to be hard-coded).\n",
    "\n",
    "    Returns:\n",
    "        (train_data_set, test_data_set): float64 arrays produced by\n",
    "        pad_sequences.\n",
    "    \"\"\"\n",
    "    time_format = '%Y-%m-%d %H:%M:%S'\n",
    "    # Order matters: it fixes the layout of the one-hot blocks in every row.\n",
    "    categorical_columns = [\n",
    "        'org:resource',\n",
    "        'case:caseStatus',\n",
    "        'monitoringResource',\n",
    "        'concept:name',\n",
    "        'case:requestComplete',\n",
    "        'case:last_phase',\n",
    "    ]\n",
    "\n",
    "    all_case_ids = set(data_processed['case:concept:name'].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    # random.sample() needs a sequence: passing a set is deprecated since\n",
    "    # Python 3.9 and raises TypeError from 3.11. Sorting also makes the split\n",
    "    # independent of set iteration order.\n",
    "    train_cids = random.sample(sorted(all_case_ids), num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    # Fit one encoder per categorical column on the whole log so train and\n",
    "    # test share the same categories.\n",
    "    encoders = {}\n",
    "    for column in categorical_columns:\n",
    "        column_values = np.array(data_processed[column].values)\n",
    "        column_values = np.reshape(column_values, (column_values.shape[0], 1))\n",
    "        encoder = OneHotEncoder(sparse=False)\n",
    "        encoder.fit(column_values)\n",
    "        encoders[column] = encoder\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        \"\"\"Return (prefix samples, longest trace length) for the given case ids.\"\"\"\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        for cid in cid_set:\n",
    "            thisdf = data_processed[data_processed['case:concept:name'] == cid]\n",
    "            num_events = thisdf.shape[0]\n",
    "            trace_length = max(trace_length, num_events)\n",
    "\n",
    "            # Parse every timestamp of the case once instead of once per use.\n",
    "            event_times = [\n",
    "                datetime.datetime.strptime(stamp, time_format)\n",
    "                for stamp in thisdf['time:timestamp_short']\n",
    "            ]\n",
    "            start_time = event_times[0]\n",
    "            end_time = event_times[-1]\n",
    "            last_time = start_time\n",
    "\n",
    "            # One-hot encode each categorical column for the whole case at once.\n",
    "            encoded_columns = [\n",
    "                encoders[column].transform(\n",
    "                    np.reshape(thisdf[column].values, (-1, 1))\n",
    "                ).tolist()\n",
    "                for column in categorical_columns\n",
    "            ]\n",
    "\n",
    "            case_id = int(cid)\n",
    "            tmpdata = []\n",
    "            for i, event_dt in enumerate(event_times):\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "\n",
    "                row = [case_id]\n",
    "                for encoded in encoded_columns:\n",
    "                    row.extend(encoded[i])\n",
    "\n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # time since case start\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # time since previous event\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # time since midnight\n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)  # time until the last event\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                # The complete trace is not emitted as a sample.\n",
    "                if i != num_events - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, _ = generate_data(train_cids_set)\n",
    "    test_data_set, _ = generate_data(test_cids_set)\n",
    "\n",
    "    # Per-column bounds from the training prefixes only. Index 0 (case id) and\n",
    "    # the last column (remaining time) are never scaled. Start from +/-inf so\n",
    "    # the bounds are right whatever the sign or magnitude of the data.\n",
    "    num_columns = len(train_data_set[0][0])\n",
    "    min_value = [float('inf')] * (num_columns - 1)\n",
    "    max_value = [float('-inf')] * (num_columns - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row)-1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "\n",
    "    def scale_features(data_set):\n",
    "        \"\"\"Min-max scale the feature columns of every prefix with the training bounds.\"\"\"\n",
    "        scaled_set = []\n",
    "        for prefix in data_set:\n",
    "            seq = []\n",
    "            for raw_row in prefix:\n",
    "                row = [raw_row[0]]  # case id is kept as-is\n",
    "                for k in range(1, len(raw_row)-1):\n",
    "                    if max_value[k] == min_value[k]:\n",
    "                        # Constant column in the training data: leave the value unscaled.\n",
    "                        row.append(raw_row[k])\n",
    "                    else:\n",
    "                        row.append((raw_row[k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "                row.append(raw_row[-1])  # remaining time is kept unscaled\n",
    "                seq.append(row)\n",
    "            scaled_set.append(seq)\n",
    "        return scaled_set\n",
    "\n",
    "    train_data_new = scale_features(train_data_set)\n",
    "    test_data_new = scale_features(test_data_set)\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(train_data_new, maxlen=max_len, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(test_data_new, maxlen=max_len, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the padded train/test arrays as .npy files.\n",
    "bpic2015_arrays = {\n",
    "    \"../data/filtered_data/BPIC 2015/train_data.npy\": train_data_set,\n",
    "    \"../data/filtered_data/BPIC 2015/test_data.npy\": test_data_set,\n",
    "}\n",
    "for npy_path, split_array in bpic2015_arrays.items():\n",
    "    np.save(npy_path, split_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   case:concept:name    concept:name time:timestamp_short  org:resource  \\\n",
      "0            4314099    01_HOOFD_010  2011-12-19 00:00:00        560872   \n",
      "1            4314099    01_HOOFD_011  2011-12-22 13:09:24        560872   \n",
      "2            4314099    01_HOOFD_015  2011-12-22 13:09:24        560872   \n",
      "3            4314099    01_HOOFD_020  2011-12-22 13:09:28        560872   \n",
      "4            4314099  01_HOOFD_030_1  2011-12-22 13:13:15        560872   \n",
      "\n",
      "   monitoringResource case:caseStatus      case:last_phase  \\\n",
      "0              560872               G  Vergunning verleend   \n",
      "1             2670601               G  Vergunning verleend   \n",
      "2              560464               G  Vergunning verleend   \n",
      "3             2670601               G  Vergunning verleend   \n",
      "4             2670601               G  Vergunning verleend   \n",
      "\n",
      "   case:requestComplete  time_spent  total_time_pred  total_time_true  \n",
      "0                  True    0.000000        60.168709            107.0  \n",
      "1                  True    3.548194        41.345912            107.0  \n",
      "2                  True    3.548194        49.026351            107.0  \n",
      "3                  True    3.548241        46.709152            107.0  \n",
      "4                  True    3.550868        44.671306            107.0  \n"
     ]
    }
   ],
   "source": [
    "# BPIC2015 event table that also carries total_time_pred / total_time_true columns.\n",
    "data_path = \"../data/drl_data/bpic2015/bpic2015_w_pred.csv\"\n",
    "df = pd.read_csv(data_path)\n",
    "\n",
    "# Preview the first five rows to confirm the expected columns were loaded.\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1192 1192\n",
      "5 101 0.3567013888888889 1486.0\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAr8AAAF1CAYAAADhgoKhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAhEElEQVR4nO3dfbRdd13n8feHptAWRBp7E0NLuaCdDtW12mIWU61L0VCotDb9Y8oqWiZKNbMcH8CBwQAuEZc6URkE1/gwmYJGeZAIdBopOtRAFzJiJS0UCgHTQmgDIQmlQHkYpO13/jg75XonN/ece8+559zze7/Wuuvsp3P2d99z8ruf/M5v752qQpIkSWrBI8ZdgCRJkrRSDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH61qiQ5kOQZY9jvbJJKsmal9y1JOr4kf5bkN8ddh1YXw68WNa7AOU4tHrMkge2fpp/hV8tmb6gktcM2X6ud4VcnlOQvgLOBv07ylSQvmTME4NokdwPv7rb9qySfS/KlJO9N8j1zXufUJP8tyae79e9Lcmq37qIk/5Dki0luT/L0Pmt7RJJtSe5Kcm+SXUnWduuO1bglyd1JPp/k5fPq2ZnkviT7uuM6uNAxz9ntTx7v9SRpGvTb5id5+rE2c85zH+4xPlH7fJx97kty+Zz5NV0b+9RufsG/LfNe56eSvG/eskry3d30o5K8qmvDDyf5kzl/h85I8o7u79AXkvx9EjPSlPKN1QlV1fOAu4Efr6rHVNXvzln9w8BTgGd1838DnAOsA24D3jhn21cB3wf8ALAWeAnwUJIzgRuB3+yWvxh4W5KZPsr7JeDKro7HA/cBfzhvmx8EzgU2Ab+W5Cnd8lcAs8CTgUuAa/o85oVeT5JWvQHb/BPpp30+5s3Ac+fMPwv4fFXd1s2f6G/LIH4H+DfABcB3A2cCv9atexFwEJgB1gMvA2qJ+9GEM/xqOX69qr5aVV8HqKrXV9X9VfUN4NeB85N8e/e/5+cDL6iqz1TVg1X1D9121wDvrKp3VtVDVXUTsBd4dh/7/4/Ay6vq4Jx9/vt5X8m9sqq+XlW3A7cD53fLnwP8dlXdV1UHgT/o85gXej1Jmnb/qs1fRD/t8zFvAq5Iclo3/xPdMmDhvy2DFJ4kwM8Cv1xVX6iq+4HfBq7uNvkmsAF4YlV9s6r+vqoMv1PKcTtajnuOTSQ5Cfgt4Cp6/3N+qFt1BvAo4BTgruO8xhOBq5L8+JxlJwPv6WP/TwSuT/LQnGUP0vtf+zGfmzP9NeAx3fTj59Y/b/pEFno9SZp2/baTcOL2+TNzN6yqO5PsA348yV8DVwAXwqJ/W740QD0zwGnArb0cDECAk7rp36MXrN/Vrd9RVdsHeH2tIoZf9WOh//3OXf4TwGbgGcAB4Nvpfc0V4PPA/wW+i15v6Vz3AH9RVT+7hLruAZ5fVf9n/ooks4s89xBwFvCxbv4J89b7P35Jreqnzf8qvTAJPBxS5w5XW7B9XsCxoQ+PAD5WVXd2y0/0t2W++TV955x1nwe+DnxPVX1m/hO7nuAXAS/qxhS/J8kHqmpPn/VrFXHYg/pxmN7Y2BP5NuAbwL30Gp/fPraiqh4CXg+8Osnjk5yU5PuTPAp4A73/7T+rW35KdyLFWX3U9SfAbyV5IkCSmSSb+zymXcBLk5zejTv+hXnr+zlmSZpG/bR//wyckuSyJCcDv0rvW75jBm2f/xJ4JvBzzBnywAn+thzH7cD3JLkgySn0enKBh/8O/U/g95Os62o6M8mzuunLk3x3Nzziy/R6qR9c5HegVcrwq378V+BXu7NgX7zANn8OfJre11kfA/5x3voXAx8BPgB8gd6JB4+oqnvo/a/+ZcBRer0F/4X+PpuvBXbT+5rq/m6f/67PY/oNeic3fAr4O+Ct9BrYY/o5ZkmaRou2f1X1JeA/AdfRa/e/Sq9NPWag9rmqDgHv
p3dS9FvmrFrsb8vc1/hnem373wH7gffN2+RXgDuBf0zy5W67c7t153TzX+nq+KOqunmhfWl1i+O5JUjyc8DVVfXD465FkiSNjj2/alKSDUku7q5FeS69sV7Xj7suSZI0Wp7wplY9EvgfwJOAL9Ibb/ZH4yxIkiSNnsMeJEmS1AyHPUiSJKkZhl9JkiQ1o68xv0l+GfgZehe4/gjw0/Sut/cWYJbehaefU1X3neh1zjjjjJqdnV16tZI0BLfeeuvnq2pm8S01KNt5SZNiobZ+0fDb3QDgl4DzqurrSXbRuxf2ecCeqtqeZBuwjd419BY0OzvL3r17l3QAkjQsST497hqmle28pEmxUFvf77CHNcCpSdbQ6/H9LL0bE+zs1u8ErlxmjZIkSdJILRp+u3tgvwq4GzgEfKmq3gWs7+7IcuzOLOuO9/wkW5PsTbL36NGjw6tckiRJGtCi4TfJ6fR6eZ8EPB54dJJr+t1BVe2oqo1VtXFmxiF2kiRJGp9+hj08A/hUVR2tqm8Cb6d37+3DSTZA725ZwJHRlSlJkiQtXz/h927goiSnJQmwCdgH7Aa2dNtsAW4YTYmSJEnScCx6tYequiXJW4HbgAeADwI7gMcAu5JcSy8gXzXKQiVJkqTl6us6v1X1CuAV8xZ/g14vsCRJkrQqeIc3SZIkNcPwK0mSpGYYfiVJktQMw68kSZKaYfiVJElSMwy/kiRJaobhV8s2u+3Gh38kaRRsXyQNi+FXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnNMPxKkiSpGYZfSZIkNWPNuAvQ6jG77caHpw9sv2yMlUiSJC2NPb+SJElqhj2/kiSSHADuBx4EHqiqjUnWAm8BZoEDwHOq6r5x1ShJw2DPryTpmB+pqguqamM3vw3YU1XnAHu6eUla1Qy/kqSFbAZ2dtM7gSvHV4okDYfhV5IEUMC7ktyaZGu3bH1VHQLoHteNrTpJGhLH/EqSAC6uqs8mWQfclOTj/T6xC8tbAc4+++xR1SdJQ2HPrySJqvps93gEuB54GnA4yQaA7vHIAs/dUVUbq2rjzMzMSpUsSUtiz69OaO61fSVNpySPBh5RVfd3088EfgPYDWwBtnePN4yvSkkaDsOvJGk9cH0S6P1deFNV/W2SDwC7klwL3A1cNcYaJWkoDL+S1Liq+iRw/nGW3wtsWvmKJGl0HPMrSZKkZhh+JUmS1IxFw2+Sc5N8aM7Pl5O8MMnaJDcl2d89nr4SBUuSJElLtWj4rapPdLe7vAD4PuBr9C6D420vJUmStKoMOuxhE3BXVX0ab3spSZKkVWbQ8Hs18OZuuq/bXibZmmRvkr1Hjx5deqWSJEnSMvUdfpM8ErgC+KtBduCdfyRJkjQpBun5/THgtqo63M33ddtLSZIkaVIMEn6fy7eGPMC3bnsJ3vZSkiRJq0Bf4TfJacAlwNvnLN4OXJJkf7du+/DLkyRJkoanr9sbV9XXgO+Yt8zbXkqSJGlV8Q5vkiRJaobhV5IkSc0w/EqSJKkZhl9JkiQ1w/ArSZKkZhh+JUmS1AzDryRJkpph+JUkSVIzDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSavC7LYbx12CpClg+JUkSVIzDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnNMPxKkiSpGYZfSZIkNcPwK0mSpGYYfiVJktQMw68kSZKaYfiVJElSM9b0s1GSxwHXAd8LFPB84BPAW4BZ4ADwnKq6bxRFavWY3Xbjw9MHtl82xkokSZL+f/32/L4W+Nuq+rfA+cA+YBuwp6rOAfZ085IkSdLEWjT8Jnks8EPA6wCq6l+q6ovAZmBnt9lO4MrRlChJkiQNRz89v08GjgJ/muSDSa5L8mhgfVUd
Auge142wTkmSJGnZ+gm/a4CnAn9cVRcCX2WAIQ5JtibZm2Tv0aNHl1imJEmStHz9hN+DwMGquqWbfyu9MHw4yQaA7vHI8Z5cVTuqamNVbZyZmRlGzZIkSdKSLBp+q+pzwD1Jzu0WbQI+BuwGtnTLtgA3jKRCSZIkaUj6utQZ8IvAG5M8Evgk8NP0gvOuJNcCdwNXjaZESZIkaTj6Cr9V9SFg43FWbRpqNZKksUlyErAX+ExVXZ5kLV7PXdKU8Q5vkqRjXkDvOu7HeD13SVPH8CtJIslZwGX07uZ5jNdzlzR1DL+SJIDXAC8BHpqzrK/ruXtJS0mrieFXkhqX5HLgSFXdupTne0lLSatJv1d7kCRNr4uBK5I8GzgFeGySN9Bdz72qDp3oeu6StJrY8ytJjauql1bVWVU1C1wNvLuqrsHruUuaQoZfSdJCtgOXJNkPXNLNS9Kq5rAHSdLDqupm4OZu+l68nrukKWPPryRJkpph+JUkSVIzDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWrGmnEXoDbMbrvx4ekD2y+bmNeSJEltsedXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnNMPxKkiSpGYZfSZIkNcPwK0mSpGas6WejJAeA+4EHgQeqamOStcBbgFngAPCcqrpvNGVKkiRJyzdIz++PVNUFVbWxm98G7Kmqc4A93bwkSZI0sZYz7GEzsLOb3glcuexqJEmSpBHqN/wW8K4ktybZ2i1bX1WHALrHdcd7YpKtSfYm2Xv06NHlVyxJkiQtUV9jfoGLq+qzSdYBNyX5eL87qKodwA6AjRs31hJqlCRJkoair57fqvps93gEuB54GnA4yQaA7vHIqIqUJEmShmHR8Jvk0Um+7dg08EzgDmA3sKXbbAtww6iKlCRJkoahn2EP64Hrkxzb/k1V9bdJPgDsSnItcDdw1ejKlCRJkpZv0fBbVZ8Ezj/O8nuBTaMoSpIkSRoF7/AmSZKkZhh+JUmS1AzDryRJkpph+JUkSVIzDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnNMPxKkiSpGYZfSZIkNcPwK0mSpGYYfiWpcUlOSfJPSW5P8tEkr+yWr01yU5L93ePp465VkpbL8CtJ+gbwo1V1PnABcGmSi4BtwJ6qOgfY081L0qpm+JWkxlXPV7rZk7ufAjYDO7vlO4ErV746SRouw68kiSQnJfkQcAS4qapuAdZX1SGA7nHdAs/dmmRvkr1Hjx5dsZolaSkMv5IkqurBqroAOAt4WpLvHeC5O6pqY1VtnJmZGVmNkjQMhl9J0sOq6ovAzcClwOEkGwC6xyPjq0yShsPwK0mNSzKT5HHd9KnAM4CPA7uBLd1mW4AbxlKgJA3RmnEXIEkauw3AziQn0esU2VVV70jyfmBXkmuBu4GrxlmkJA2D4VeSGldVHwYuPM7ye4FNK1+RJI2Owx4kSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnN6Dv8dre+/GCSd3Tza5PclGR/93j66MqUJEmSlm+Qnt8XAPvmzG8D9lTVOcCebl6SJEmaWH2F3yRnAZcB181ZvBnY2U3vBK4camWSJEnSkPXb8/sa4CXAQ3OWra+qQwDd47rjPTHJ1iR7k+w9evTocmqVJEmSlmXR8JvkcuBIVd26lB1U1Y6q2lhVG2dmZpbyEpIkSdJQ9HN744uBK5I8GzgFeGySNwCHk2yoqkNJNgBHRlmoJEmStFyL9vxW1Uur6qyqmgWuBt5dVdcAu4Et3WZbgBtGVqUkSZI0BMu5zu924JIk+4FLunlJkiRpYvUz7OFhVXUzcHM3fS+wafglSZIkSaPhHd4kSZLU
DMOvJEmSmmH4lSRJUjMMvwJgdtuND/9I0iSwPZI0CoZfSZIkNcPwK0mSpGYYfiVJktQMw68kSZKaYfiVJElSMwy/kiRJaobhV5IkSc1YM+4CNL28RqckSZo09vxKkiSpGYZfSZIkNcPwK0mSpGYYfiVJE83zByQNk+FXkiRJzTD8SpIkqRmGX0mSJDXD8CtJkqRmGH4lSZLUDMOvJEmSmmH4lSRJUjMMv5IkSWqG4VeSJEnNMPxKkiSpGYZfSZIkNcPwK0maKLPbblzSOknqh+FXkiRJzTD8SpIkqRmGX0mSJDVj0fCb5JQk/5Tk9iQfTfLKbvnaJDcl2d89nj76ciVJkqSl66fn9xvAj1bV+cAFwKVJLgK2AXuq6hxgTzcvSZIkTaxFw2/1fKWbPbn7KWAzsLNbvhO4chQFSpIkScPS15jfJCcl+RBwBLipqm4B1lfVIYDucd3IqpQkSZKGYE0/G1XVg8AFSR4HXJ/ke/vdQZKtwFaAs88+eyk1Dt3c60Qe2H7ZGCuRpPFL8gTgz4HvBB4CdlTVa5OsBd4CzAIHgOdU1X3jqlOShmGgqz1U1ReBm4FLgcNJNgB0j0cWeM6OqtpYVRtnZmaWV60kaRQeAF5UVU8BLgJ+Psl5eG6HpCnUz9UeZroeX5KcCjwD+DiwG9jSbbYFuGFENUqSRqiqDlXVbd30/cA+4Ew8t0PSFOpn2MMGYGeSk+iF5V1V9Y4k7wd2JbkWuBu4aoR1SpJWQJJZ4ELg/zu3I4nndkha9RYNv1X1YXoN4fzl9wKbRlGUpptjrqXJlOQxwNuAF1bVl5P0+7yJO7dDkhbiHd4kSSQ5mV7wfWNVvb1b7LkdkqaO4VeSGpdeF+/rgH1V9eo5qzy3Q9LU6etSZ5KkqXYx8DzgI9013QFeBmzHczskTRnDr5Zk7rjdldjHoGODHVcs9a+q3gcsNMDXczskTRWHPUiSJKkZhl9JkiQ1w/ArSZKkZjjmtzGOhZUkSS2z51eSJEnNMPxKkiSpGYZfSdKqtBKXXJQ0fRzzq1XNMcySJGkQ9vxKkiSpGYZfSZIkNcPwK0mSpGYYfiVJktQMw68kSZKaYfiVJElSMwy/kiRJaobX+dWq4MXsJUnSMNjzK0mSpGYYfiVJktQMw68kadVySJSkQTnmV2M19w/Xge2XjbESSZLUAnt+JUmS1AzDryRJkpph+JUkSVIzDL+SJElqhuFXkjR2XrVB0kox/EqSJKkZhl9JkiQ1w/ArSZKkZiwafpM8Icl7kuxL8tEkL+iWr01yU5L93ePpoy9XkiRJWrp+en4fAF5UVU8BLgJ+Psl5wDZgT1WdA+zp5iVJkqSJtWj4rapDVXVbN30/sA84E9gM7Ow22wlcOaIaJUnqi1eNkLSYNYNsnGQWuBC4BVhfVYegF5CTrFvgOVuBrQBnn332soodl7mN6YHtl42xEkmSJC1H3ye8JXkM8DbghVX15X6fV1U7qmpjVW2cmZlZSo2SJEnSUPQVfpOcTC/4vrGq3t4tPpxkQ7d+A3BkNCVKkiRJw9HP1R4CvA7YV1WvnrNqN7Clm94C3DD88iRJkqTh6WfM78XA84CPJPlQt+xlwHZgV5JrgbuBq0ZSoTREjt+WJKlti4bfqnofkAVWbxpuOZIkSdLoeIc3SZIkNcPwK0mSpGYMdJ1faZS8OL0kSRo1e34lSZLUDMOvJGniDPpNkN8cSeqX4VeSJEnNcMxvw1rpKVlOD5LXApYkabrY8ytJkqRmGH4lSZLUDMOvJEmSmuGY3wa0MrZXkiRpMfb8SpIkqRmGX0mSJDXD8CtJIsnrkxxJcsecZWuT3JRkf/d4+jhrlKRhMPxKkgD+DLh03rJtwJ6qOgfY081L0qpm+JUkUVXvBb4wb/FmYGc3vRO4ciVrkqRRMPxKkhayvqoOAXSP6463UZKtSfYm2Xv06NFl7bDfq9N4FRtJS2X4lSQtS1XtqKqNVbVxZmZm3OVI0glN9XV+5/YMHNh+
2Rgr0bTxs6VGHE6yoaoOJdkAHBl3QZK0XPb8SpIWshvY0k1vAW4YYy2SNBSGX0kSSd4MvB84N8nBJNcC24FLkuwHLunmJWlVm+phD5Kk/lTVcxdYtWlFC5GkEZv48DussZWjPjPYMaCSJEmTz2EPkiRJaobhV5IkSc0w/EqSJKkZEz/mVyuvlTsnOU5bkqT22PMrSZKkZhh+JUkTq5VvoiStHMOvJEmSmuGY31XC8amLs4dIkiQtZtGe3ySvT3IkyR1zlq1NclOS/d3j6aMtU5IkSVq+foY9/Blw6bxl24A9VXUOsKeblyRJkibaouG3qt4LfGHe4s3Azm56J3DlcMuSJEmShm+pY37XV9UhgKo6lGTdQhsm2QpsBTj77LOXuLueSRj3OqwaJuFY9C2rcbywnyFJkgY38qs9VNWOqtpYVRtnZmZGvTtJkiRpQUsNv4eTbADoHo8MryRJkiRpNJYafncDW7rpLcANwylHkiRJGp1Fx/wmeTPwdOCMJAeBVwDbgV1JrgXuBq4aZZGjtNrHTa72+leT1TguWJIk/WuLht+qeu4CqzYNuRZJkiRppLy9sSRJkpph+JUkSVIzDL+SJElqxlJvcqEh8sYZms/3UpKk0bDnV5IkSc0w/EqSVpX5lx080WUIj63zUoWSjjH8SpIkqRmO+V2GfnoSBu1tcKzn6rbQ++1YbkmSJoM9v5IkSWqG4VeSJEnNMPxKkiSpGVMx5nfSxkR6VvH08L2UVp/ZbTf2/bdgkG1X8rUkjY49v5IkSWqG4VeSJEnNMPxKkiSpGVMx5ncUJnms5yTXpsWN4lrAkiSpP/b8SpIkqRmGX0nSVJj/rcpSviXzmzVp+hl+JUmS1IypG/Pr/9p7/D1MlmG9Hwu9zkqOI56062pLkjQIe34lSZLUDMOvJEmSmmH4lSRJUjOmbszvckzaONlJq0fHN+rxvMN6Tcf/atIN89/A3NcadKz8IPtY7HN/bB/9bOe/IWll2PMrSZKkZhh+JUmS1AzDryRJkprhmF+pESt5LeCF9uuYRknSuNnzK0mSpGbY8ytJasbxvgGZf0WG421zYPtlfV8dYqF9zH+NE30TstjVH+bW3O+VIuZuN/85i823ZO57NcjvoOXf2Wpjz68kSZKasaye3ySXAq8FTgKuq6rtQ6lK0lgs57qnoxjbu9BrOo54ZdnWS5omS+75TXIS8IfAjwHnAc9Nct6wCpMkjZ9tvaRps5xhD08D7qyqT1bVvwB/CWweTlmSpAlhWy9pqiwn/J4J3DNn/mC3TJI0PWzrJU2VVNXSnphcBTyrqn6mm38e8LSq+sV5220Ftnaz5wKfWHq5E+0M4PPjLmJEPLbVyWNb2BOramZYxUyzftr6IbXzk/R5naRaYLLqmaRawHpOZJJqgfHUc9y2fjknvB0EnjBn/izgs/M3qqodwI5l7GdVSLK3qjaOu45R8NhWJ49NQ7JoWz+Mdn6S3tNJqgUmq55JqgWs50QmqRaYrHqWM+zhA8A5SZ6U5JHA1cDu4ZQlSZoQtvWSpsqSe36r6oEkvwD8b3qXv3l9VX10aJVJksbOtl7StFnWdX6r6p3AO4dUy2o3zUM7PLbVyWPTUKxQWz9J7+kk1QKTVc8k1QLWcyKTVAtMUD1LPuFNkiRJWm28vbEkSZKaYfgdUJInJHlPkn1JPprkBd3ytUluSrK/ezx93LUuVZKTknwwyTu6+ak4tiSPS/LWJB/v3r/vn6Jj++Xu83hHkjcnOWU1H1uS1yc5kuSOOcsWPJ4kL01yZ5JPJHnWeKrWUiS5tHvf7kyybQX2N3AbvhKfr0Ha3VHXM2hbOcp6Bm3bhl3LsNqiJN+X5CPduj9IkiHW83vde/XhJNcnedxK1HO8Wuase3GSSnLGStQysKryZ4AfYAPw1G7624B/pnfLz98FtnXLtwG/M+5al3GM/xl4E/CObn4qjg3YCfxMN/1I4HHTcGz0bjjwKeDUbn4X8FOr+diAHwKeCtwx
Z9lxj6f793c78CjgScBdwEnjPgZ/+nqfT+reryd3/yZvB84b8T4HasNX6vPVb7u7EvUM0laOsp5B27ZR1DKstgj4J+D7gQB/A/zYEOt5JrCmm/6dlarneLV0y59A7wTZTwNnrNTvZpAfe34HVFWHquq2bvp+YB+9f6Cb6TUYdI9XjqXAZUpyFnAZcN2cxav+2JI8lt4/1NcBVNW/VNUXmYJj66wBTk2yBjiN3nVYV+2xVdV7gS/MW7zQ8WwG/rKqvlFVnwLupHdLXk2+Fb918hLa8JF/vgZsd0dazxLaylH/fgZp24ZeyzDaoiQbgMdW1furl/b+nCW2x8erp6reVVUPdLP/SO9a3COvZ4HfDcDvAy8B5p5UNvLfzSAMv8uQZBa4ELgFWF9Vh6DXuALrxljacryG3of2oTnLpuHYngwcBf60+2rxuiSPZgqOrao+A7wKuBs4BHypqt7FFBzbPAsdj7ffXb3G+t712YavRI2vof92d9T1DNpWjqyeJbRtK/V5GnT/Z3bTo64L4Pn0ek/HUk+SK4DPVNXt81ZNwu/mYYbfJUryGOBtwAur6svjrmcYklwOHKmqW8ddywisoff1zB9X1YXAV+l9XbXqdePNNtP7KunxwKOTXDPeqlbU8caHeRmb1WFs790AbfhIa1xCuzvq39mgbeXI6llC2zbutmCh/a9IXUleDjwAvHEc9SQ5DXg58GvHW72StSzG8LsESU6m12i+sare3i0+3HXf0z0eGVd9y3AxcEWSA/S+fvzRJG9gOo7tIHCwqm7p5t9Kr4GfhmN7BvCpqjpaVd8E3g78ANNxbHMtdDx93WpdE2ks792Abfioaxy03R11PYO2laOsZ9C2baU+T4Pu/yDfGoowkrqSbAEuB36yGz4wjnq+i95/VG7vPs9nAbcl+c4x1HJCht8BdWchvg7YV1WvnrNqN7Clm94C3LDStS1XVb20qs6qqll6tzB9d1Vdw3Qc2+eAe5Kc2y3aBHyMKTg2el8JXpTktO7zuYneOMZpOLa5Fjqe3cDVSR6V5EnAOfROoNDkW/FbJy+hDR/p52sJ7e6o6xm0rRxlPYO2bSvVFgy0/25oxP1JLuqO4z8wxPY4yaXArwBXVNXX5tW5YvVU1Ueqal1VzXaf54P0Ti793ErX0k+x/gx2duMP0uuS/zDwoe7n2cB3AHuA/d3j2nHXuszjfDrfOut4Ko4NuADY2713/ws4fYqO7ZXAx4E7gL+gd0btqj024M30xvh9k14Deu2JjofeV213AZ9gBc4U9meo7/Wz6V1x4S7g5Suwv4Hb8JX6fPXb7o66nkHbylHWM2jbNuxahtUWARu7Y7gL+O90NxkbUj130htPe+zz/CcrUc/xapm3/gDd1R5W4nczyI93eJMkSVIzHPYgSZKkZhh+JUmS1AzDryRJkpph+JUkSVIzDL+SJElqhuFXkiRJzTD8SpIkqRmGX0mSJDXj/wFYHvOEV5LbDwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "display_dataset(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "523\n"
     ]
    }
   ],
   "source": [
    "# Per-case statistics are computed once, instead of re-filtering df for every case id.\n",
    "case_col = \"case:concept:name\"\n",
    "# total_time_true of each case's first row (the value iloc[0] used to read).\n",
    "first_true_value = df.drop_duplicates(subset=case_col).set_index(case_col)[\"total_time_true\"]\n",
    "# Number of events (rows) per case.\n",
    "trace_lengths = df[case_col].value_counts()\n",
    "\n",
    "filtered_cids = []\n",
    "# Iterate over the same set-derived order as before so filtered_cids keeps its ordering.\n",
    "for cid in list(set(df[case_col].values)):\n",
    "    # Keep cases with total_time_true in [30, 100] and a trace of 15 to 50 events (inclusive).\n",
    "    if 30 <= first_true_value.loc[cid] <= 100 and 15 <= trace_lengths.loc[cid] <= 50:\n",
    "        filtered_cids.append(cid)\n",
    "print(len(filtered_cids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200 50\n",
      "(9907, 11)\n"
     ]
    }
   ],
   "source": [
    "df_samples, train_cids, test_cids = sample_data(df, filtered_cids, 200, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DeepReinforceLearningParameters\n",
    "\n",
    "# DATA_PATH as provided by the project config for the bpic2015 dataset.\n",
    "save_path = DeepReinforceLearningParameters(dataset_name=\"bpic2015\").DATA_PATH\n",
    "sample_csv_file = save_path + \"/bpic2015_sample250.csv\"\n",
    "train_ids_file = save_path + \"/train_case_ids.npy\"\n",
    "test_ids_file = save_path + \"/test_case_ids.npy\"\n",
    "\n",
    "# Write the sampled events together with the train/test case-id split.\n",
    "df_samples.to_csv(sample_csv_file, index=False)\n",
    "np.save(train_ids_file, np.array(train_cids))\n",
    "np.save(test_ids_file, np.array(test_cids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BPIC2012数据集处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "96ae8f72178c4c6eb534a74182b2dcbc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=13087.0), HTML(va…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import pm4py\n",
    "\n",
    "# Raw BPI Challenge 2012 event log in XES format.\n",
    "data_path = \"../data/raw_data/BPIC2012/12689204/BPI_Challenge_2012.xes\"\n",
    "log = pm4py.read_xes(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pm4py.convert_to_dataframe(log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>org:resource</th>\n",
       "      <th>lifecycle:transition</th>\n",
       "      <th>concept:name</th>\n",
       "      <th>time:timestamp</th>\n",
       "      <th>case:REG_DATE</th>\n",
       "      <th>case:concept:name</th>\n",
       "      <th>case:AMOUNT_REQ</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>112</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>A_SUBMITTED</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>173688</td>\n",
       "      <td>20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>112</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>A_PARTLYSUBMITTED</td>\n",
       "      <td>2011-10-01 00:38:44.880000+02:00</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>173688</td>\n",
       "      <td>20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>112</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>A_PREACCEPTED</td>\n",
       "      <td>2011-10-01 00:39:37.906000+02:00</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>173688</td>\n",
       "      <td>20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>112</td>\n",
       "      <td>SCHEDULE</td>\n",
       "      <td>W_Completeren aanvraag</td>\n",
       "      <td>2011-10-01 00:39:38.875000+02:00</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>173688</td>\n",
       "      <td>20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>START</td>\n",
       "      <td>W_Completeren aanvraag</td>\n",
       "      <td>2011-10-01 11:36:46.437000+02:00</td>\n",
       "      <td>2011-10-01 00:38:44.546000+02:00</td>\n",
       "      <td>173688</td>\n",
       "      <td>20000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262195</th>\n",
       "      <td>112</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>A_PARTLYSUBMITTED</td>\n",
       "      <td>2012-02-29 23:51:17.423000+01:00</td>\n",
       "      <td>2012-02-29 23:51:16.799000+01:00</td>\n",
       "      <td>214376</td>\n",
       "      <td>15000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262196</th>\n",
       "      <td>112</td>\n",
       "      <td>SCHEDULE</td>\n",
       "      <td>W_Afhandelen leads</td>\n",
       "      <td>2012-02-29 23:52:01.287000+01:00</td>\n",
       "      <td>2012-02-29 23:51:16.799000+01:00</td>\n",
       "      <td>214376</td>\n",
       "      <td>15000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262197</th>\n",
       "      <td>11169</td>\n",
       "      <td>START</td>\n",
       "      <td>W_Afhandelen leads</td>\n",
       "      <td>2012-03-01 09:26:46.736000+01:00</td>\n",
       "      <td>2012-02-29 23:51:16.799000+01:00</td>\n",
       "      <td>214376</td>\n",
       "      <td>15000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262198</th>\n",
       "      <td>11169</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>A_DECLINED</td>\n",
       "      <td>2012-03-01 09:27:37.118000+01:00</td>\n",
       "      <td>2012-02-29 23:51:16.799000+01:00</td>\n",
       "      <td>214376</td>\n",
       "      <td>15000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262199</th>\n",
       "      <td>11169</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>W_Afhandelen leads</td>\n",
       "      <td>2012-03-01 09:27:41.325000+01:00</td>\n",
       "      <td>2012-02-29 23:51:16.799000+01:00</td>\n",
       "      <td>214376</td>\n",
       "      <td>15000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>262200 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       org:resource lifecycle:transition            concept:name  \\\n",
       "0               112             COMPLETE             A_SUBMITTED   \n",
       "1               112             COMPLETE       A_PARTLYSUBMITTED   \n",
       "2               112             COMPLETE           A_PREACCEPTED   \n",
       "3               112             SCHEDULE  W_Completeren aanvraag   \n",
       "4               NaN                START  W_Completeren aanvraag   \n",
       "...             ...                  ...                     ...   \n",
       "262195          112             COMPLETE       A_PARTLYSUBMITTED   \n",
       "262196          112             SCHEDULE      W_Afhandelen leads   \n",
       "262197        11169                START      W_Afhandelen leads   \n",
       "262198        11169             COMPLETE              A_DECLINED   \n",
       "262199        11169             COMPLETE      W_Afhandelen leads   \n",
       "\n",
       "                          time:timestamp                     case:REG_DATE  \\\n",
       "0       2011-10-01 00:38:44.546000+02:00  2011-10-01 00:38:44.546000+02:00   \n",
       "1       2011-10-01 00:38:44.880000+02:00  2011-10-01 00:38:44.546000+02:00   \n",
       "2       2011-10-01 00:39:37.906000+02:00  2011-10-01 00:38:44.546000+02:00   \n",
       "3       2011-10-01 00:39:38.875000+02:00  2011-10-01 00:38:44.546000+02:00   \n",
       "4       2011-10-01 11:36:46.437000+02:00  2011-10-01 00:38:44.546000+02:00   \n",
       "...                                  ...                               ...   \n",
       "262195  2012-02-29 23:51:17.423000+01:00  2012-02-29 23:51:16.799000+01:00   \n",
       "262196  2012-02-29 23:52:01.287000+01:00  2012-02-29 23:51:16.799000+01:00   \n",
       "262197  2012-03-01 09:26:46.736000+01:00  2012-02-29 23:51:16.799000+01:00   \n",
       "262198  2012-03-01 09:27:37.118000+01:00  2012-02-29 23:51:16.799000+01:00   \n",
       "262199  2012-03-01 09:27:41.325000+01:00  2012-02-29 23:51:16.799000+01:00   \n",
       "\n",
       "       case:concept:name case:AMOUNT_REQ  \n",
       "0                 173688           20000  \n",
       "1                 173688           20000  \n",
       "2                 173688           20000  \n",
       "3                 173688           20000  \n",
       "4                 173688           20000  \n",
       "...                  ...             ...  \n",
       "262195            214376           15000  \n",
       "262196            214376           15000  \n",
       "262197            214376           15000  \n",
       "262198            214376           15000  \n",
       "262199            214376           15000  \n",
       "\n",
       "[262200 rows x 7 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000 1889\n",
      "10000 3829\n"
     ]
    }
   ],
   "source": [
    "# Count events per case in a single pass instead of masking the whole frame once per case id.\n",
    "# sort=False keeps the cases in order of first appearance, so the result no longer depends\n",
    "# on the iteration order of a set of string ids.\n",
    "trace_lengths = df.groupby(\"case:concept:name\", sort=False).size()\n",
    "# Keep the ids of cases whose trace has between 15 and 50 events (inclusive).\n",
    "valid_cnts = trace_lengths[(trace_lengths >= 15) & (trace_lengths <= 50)].index.tolist()\n",
    "# Total number of cases, number of cases kept.\n",
    "print(len(trace_lengths), len(valid_cnts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 174740\n",
      "100 188407\n",
      "200 204272\n",
      "300 186523\n",
      "400 180181\n",
      "500 182699\n",
      "600 185455\n",
      "700 181532\n",
      "800 186017\n",
      "900 176954\n",
      "1000 194088\n",
      "1100 210272\n",
      "1200 201674\n",
      "1300 196213\n",
      "1400 182771\n",
      "1500 177122\n",
      "1600 194683\n",
      "1700 198182\n",
      "1800 188732\n",
      "1900 175380\n",
      "2000 176708\n",
      "2100 205346\n",
      "2200 176302\n",
      "2300 192310\n",
      "2400 174367\n",
      "2500 205676\n",
      "2600 199210\n",
      "2700 203055\n",
      "2800 181664\n",
      "2900 191524\n",
      "3000 198816\n",
      "3100 176765\n",
      "3200 208535\n",
      "3300 195669\n",
      "3400 206958\n",
      "3500 180196\n",
      "3600 204394\n",
      "3700 201644\n",
      "3800 212875\n",
      "3900 175266\n",
      "4000 178004\n",
      "4100 200131\n",
      "4200 207248\n",
      "4300 206624\n",
      "4400 185725\n",
      "4500 194221\n",
      "4600 206519\n",
      "4700 180454\n",
      "4800 193831\n",
      "4900 191380\n",
      "5000 175801\n",
      "(154546, 7)\n"
     ]
    }
   ],
   "source": [
    "# Select every event of the valid cases with one boolean mask. Growing a DataFrame with\n",
    "# pd.concat inside a loop re-copies the accumulated rows on each iteration, and yields a\n",
    "# column-less frame when valid_cnts is empty.\n",
    "# Rows keep the order they have in df; .copy() makes the result independent of df.\n",
    "df_filtered = df[df[\"case:concept:name\"].isin(valid_cnts)].copy()\n",
    "print(df_filtered.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns kept in the exported BPIC2012 event table.\n",
    "export_columns = [\n",
    "    \"lifecycle:transition\",\n",
    "    \"time:timestamp\",\n",
    "    \"concept:name\",\n",
    "    \"case:concept:name\",\n",
    "    \"case:AMOUNT_REQ\",\n",
    "]\n",
    "df_f = df_filtered[export_columns]\n",
    "# Write without the index column.\n",
    "df_f.to_csv(\"../data/filtered_data/BPIC2012/filtered_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "2000\n",
      "4000\n",
      "6000\n",
      "8000\n",
      "10000\n",
      "12000\n",
      "14000\n",
      "16000\n",
      "18000\n",
      "20000\n",
      "22000\n",
      "24000\n",
      "26000\n",
      "28000\n",
      "30000\n",
      "32000\n",
      "34000\n",
      "36000\n",
      "38000\n",
      "40000\n",
      "42000\n",
      "44000\n",
      "46000\n",
      "48000\n",
      "50000\n",
      "52000\n",
      "54000\n",
      "56000\n",
      "58000\n",
      "60000\n",
      "62000\n",
      "64000\n",
      "66000\n",
      "68000\n",
      "70000\n",
      "72000\n",
      "74000\n",
      "76000\n",
      "78000\n",
      "80000\n",
      "82000\n",
      "84000\n",
      "86000\n",
      "88000\n",
      "90000\n",
      "92000\n",
      "94000\n",
      "96000\n",
      "98000\n",
      "100000\n",
      "102000\n",
      "104000\n",
      "106000\n",
      "108000\n",
      "110000\n",
      "112000\n",
      "114000\n",
      "116000\n",
      "118000\n",
      "120000\n",
      "122000\n",
      "124000\n",
      "126000\n",
      "128000\n",
      "130000\n",
      "132000\n",
      "134000\n",
      "136000\n",
      "138000\n",
      "140000\n",
      "142000\n",
      "144000\n",
      "146000\n",
      "148000\n",
      "150000\n",
      "152000\n",
      "154000\n",
      "(154546, 5)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import BPIC2012Preprocess\n",
    "\n",
    "# Run the project's BPIC2012 preprocessing step and report the size of the resulting frame.\n",
    "bpic2012_preprocessor = BPIC2012Preprocess()\n",
    "df = bpic2012_preprocessor.preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  3509 , test_data_size:  1505\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed):\n",
    "    all_case_ids = set(data_processed['case:concept:name'].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    train_cids = random.sample(all_case_ids, num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    activity_names = np.array(data_processed['concept:name'].values)\n",
    "    activity_names = np.reshape(activity_names, (activity_names.shape[0], 1))\n",
    "    ohe_act = OneHotEncoder(sparse=False)\n",
    "    ohe_act.fit(activity_names)\n",
    "    \n",
    "    life_transitions = np.array(data_processed['lifecycle:transition'].values)\n",
    "    life_transitions = np.reshape(life_transitions, (life_transitions.shape[0], 1))\n",
    "    ohe_lt = OneHotEncoder(sparse=False)\n",
    "    ohe_lt.fit(life_transitions)\n",
    "    \n",
    "    weeks = np.array(list(range(0, 7)))\n",
    "    weeks = np.reshape(weeks, (weeks.shape[0], 1))\n",
    "    ohe_week = OneHotEncoder(sparse=False)\n",
    "    ohe_week.fit(weeks)\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        for cid in cid_set:\n",
    "            thisdf = data_processed[data_processed['case:concept:name'] == cid]\n",
    "            trace_length = max(trace_length, thisdf.shape[0])\n",
    "            tmpdata = []\n",
    "            \n",
    "            start_time = datetime.datetime.strptime(thisdf.iloc[0]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            end_time = datetime.datetime.strptime(thisdf.iloc[-1]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            last_time = start_time\n",
    "\n",
    "            for i in range(thisdf.shape[0]):\n",
    "                row = [int(thisdf.iloc[i]['case:concept:name'])]\n",
    "                \n",
    "                event_dt = datetime.datetime.strptime(thisdf.iloc[i]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "                \n",
    "                activity_name = np.array(thisdf.iloc[i]['concept:name'])\n",
    "                activity_name = np.reshape(activity_name, (-1, 1))\n",
    "                \n",
    "                life_transition = np.array(thisdf.iloc[i]['lifecycle:transition'])\n",
    "                life_transition = np.reshape(life_transition, (-1, 1))\n",
    "                weekday = event_dt.weekday()\n",
    "                weekday = np.reshape(np.array([weekday]), (-1, 1))\n",
    "                \n",
    "                row.extend(ohe_act.transform(activity_name).tolist()[0])  # 活动名称one-hot编码\n",
    "                row.extend(ohe_lt.transform(life_transition).tolist()[0])\n",
    "                row.extend(ohe_week.transform(weekday).tolist()[0])\n",
    "                row.append(thisdf.iloc[i]['case:AMOUNT_REQ'])\n",
    "                \n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # 总花费时间\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # 相比上次活动花费时间\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # 距午夜时间\n",
    "                \n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                if i != thisdf.shape[0] - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, max_train_trace_length = generate_data(train_cids_set)\n",
    "    test_data_set, max_test_trace_length = generate_data(test_cids_set)\n",
    "\n",
    "    min_value = [1e20] * (len(train_data_set[0][0]) - 1)\n",
    "    max_value = [-1] * (len(train_data_set[0][0]) - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row)-1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "    train_data_new = []\n",
    "    for i in range(len(train_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(train_data_set[i])):\n",
    "            row = [train_data_set[i][j][0]]\n",
    "            for k in range(1, len(train_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(train_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((train_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(train_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        train_data_new.append(seq)\n",
    "    test_data_new = []\n",
    "    for i in range(len(test_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(test_data_set[i])):\n",
    "            row = [test_data_set[i][j][0]]\n",
    "            for k in range(1, len(test_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(test_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((test_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(test_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        test_data_new.append(seq)\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(train_data_new, maxlen=20, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(test_data_new, maxlen=20, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(\"../data/filtered_data/BPIC2012/train_data.npy\", train_data_set)\n",
    "np.save(\"../data/filtered_data/BPIC2012/test_data.npy\", test_data_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  lifecycle:transition time:timestamp_short        concept:name  \\\n",
      "0             COMPLETE  2011-12-29 23:57:07         A_SUBMITTED   \n",
      "1             COMPLETE  2011-12-29 23:57:07   A_PARTLYSUBMITTED   \n",
      "2             SCHEDULE  2011-12-29 23:57:52  W_Afhandelen leads   \n",
      "3                START  2011-12-30 09:01:28  W_Afhandelen leads   \n",
      "4             COMPLETE  2011-12-30 09:02:16       A_PREACCEPTED   \n",
      "\n",
      "   case:concept:name  case:AMOUNT_REQ  time_spent  total_time_pred  \\\n",
      "0             196620            10000    0.000000        10.664392   \n",
      "1             196620            10000    0.000000        10.706963   \n",
      "2             196620            10000    0.000521        10.894831   \n",
      "3             196620            10000    0.378021         9.983064   \n",
      "4             196620            10000    0.378576        10.023651   \n",
      "\n",
      "   total_time_true  \n",
      "0         1.547963  \n",
      "1         1.547963  \n",
      "2         1.547963  \n",
      "3         1.547963  \n",
      "4         1.547963  \n"
     ]
    }
   ],
   "source": [
    "data_path = \"../data/drl_data/bpic2012/bpic2012_wp_nn.csv\"\n",
    "\n",
    "df = pd.read_csv(data_path)\n",
    "print(df.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 1: 抽样case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5014 5014\n",
      "15 50 0.00587962962962963 63.054212962962964\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAF1CAYAAADr3izzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfEElEQVR4nO3df7BkZX3n8fdHIGiQjbAM7PBDL1FEIbUMZoo1S8pgUCGMCm4tFmS1JhU2Y2Vhg1u4etGUkq2MO7vrj2Rr1eyoLBOj4KzKMnFYIxItwsaIAwEcGJFRRhgYZy7iD3RdIsN3/+hztb1239v33u7b3Xfer6qu7n7O6XO+fW7Pcz5z+ulzUlVIkiRJB7qnDbsASZIkaRQYjCVJkiQMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGCsZSLJriQvG8J6J5JUkoOXet2SpM6SXJPkj4ddh8aPwVgLNqwwOkwH4nuWJLD/04HBYKyB8SiqJB047PO1HBiMtSBJPgI8G/jLJD9I8ua2YQWXJHkQ+Otm3v+Z5FtJvpfkliSnti3nGUneneSbzfRbkzyjmfbiJH+b5LtJ7kpyVo+1PS3JZJKvJ/l2ks1JjmymTde4NsmDSR5N8rYZ9WxK8p0kO5r3tbvbe25b7b/qtDxJWg567fOTnDXdZ7a99idHmmfrnzusc0eSV7Y9P7jpY1/UPO+6b5mxnN9JcuuMtkryvObxoUne1fThe5P8Wdt+6Kgkn272Q48l+ZskZqdlzD+uFqSqXg88CLyqqp5ZVf+5bfJvAC8Ezmme/2/gJOBo4A7go23zvgv4VeCfA0cCbwaeSnIcsBX446b9TcAnk6zoobw/AC5o6jgW+A7wvhnz/DpwMnA28PYkL2za3wFMAL8MvBx4XY/vudvyJGnszbPPn00v/fO0a4GL256fAzxaVXc0z2fbt8zHfwKeD6wCngccB7y9mXYFsBtYARwDvBWoBa5HY8BgrEG4qqp+WFU/Aqiqq6vq8ap6ArgKOC3JLzX/6/5d4PKqeriq9lfV3zbzvQ64sapurKqnquomYBtwXg/rfwPwtqra3bbOfznja74/qqofVdVdwF3AaU37a4F3VtV3qmo38F97fM/dlidJy93P9Plz6KV/nvYx4NVJfrF5/ttNG9B93zKfwpME+D3g31XVY1X1OPBO4KJmlh8DK4HnVNWPq+pvqspgvIw5HkiD8ND0gyQHAeuBC2n9j/upZtJRwKHA04Gvd1jGc4ALk7yqre0Q4PM9rP85wPVJnmpr20/rf/vTvtX2+P8Cz2weH9te/4zHs+m2PEla7nrtJ2H2/vnh9hmrameSHcCrkvwl8GrgdJhz3/K9edSzAvhF4PZWRgYgwEHN4/9CK3R/tpm+sao2zGP5GjMGYy1Gt/81t7f/NnA+8DJgF/BLtL46C/Ao8P+A59I6ytruIeAjVfV7C6jrIeB3q+r/zJyQZGKO1+4BjgfubZ6fMGO6RwokHah66fN/SCtoAj8JsO1D4Lr2z11MD6d4GnBvVe1s2mfbt8w0s6Z/0jbtUeBHwKlV9fDMFzZHkK8ArmjGMH8+yZer6uYe69eYcSiFFmMvrbG4szkceAL4Nq2O6Z3TE6rqKeBq4D1Jjk1yUJJfS3Io8Be0jhKc07Q/vflRx/E91PVnwPokzwFIsiLJ+T2+p83AlUmOaMY5XzZjei/vWZKWo176v68BT0+yJskhwB/S+nZw2nz75+uAVwC/T9swCmbZt3RwF3BqklVJnk7rCDDwk/3QB4H3Jjm6qem4JOc0j1+Z5HnNkIvv0zq6vX+ObaAxZjDWYvxH4A+bX+u+qcs8fw58k9ZXZPcCfzdj+puArwBfBh6j9SOIp1XVQ7SOBrwVmKJ1lOHf09tn9k+BLbS++nq8Wec/6/E9/QdaP7R4APgc8Alane+0Xt6zJC1Hc/Z/VfU94N8AH6LV7/+QVp86bV79c1XtAb5I6wfaH2+bNNe+
pX0ZX6PVt38OuB+4dcYsbwF2An+X5PvNfCc3005qnv+gqeP9VfWFbuvS+ItjyKXukvw+cFFV/cawa5EkSYPlEWOpTZKVSc5szrV5Mq2xZdcPuy5JkjR4/vhO+lm/APx34ETgu7TGt71/mAVJkqSlMedQimag+i20Bs8fDHyiqt7RXKnm47QuhrALeG1Vfad5zZXAJbQGqP9BVf3VoN6AJEmS1A+9BOMAh1XVD5pfmN4KXA78C+CxqtqQZBI4oqrekuQUWqdXOYPWOWE/Bzy/qvwVpyRJkkbWnGOMq+UHzdNDmlvROmPApqZ9E61LPNK0X1dVT1TVA7R+6XlGP4uWJEmS+q2nMcbNCbpvp3UN8fdV1ZeSHNOcRoWq2jN9/j9a1xhvP23K7qZt5jLXAesADjvssF99wQtesPB3IUl9cPvttz9aVSvmnlMLcdRRR9XExMSwy5B0gJutr+8pGDfDIFYleRatSzn+yiyzd7rqzM+N16iqjcBGgNWrV9e2bdt6KUWSBibJN4ddw3I2MTGBfb2kYZutr5/X6dqq6rvAF4Bzgb1JVjYrWAnsa2bbzc9eRvd44JH5rEeSJElaanMG4+Zyjc9qHj+D1nXJv0rryjVrm9nWAjc0j7cAFyU5NMmJtK4ac1uf65YkSZL6qpehFCuBTc0446cBm6vq00m+CGxOcgnwIHAhQFXdk2QzrUs0Pglc6hkpJEmSNOrmDMZVdTdweof2bwNnd3nNemD9oquTJEmSloiXhJYkSZIwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAno7ZLQGnETk1u7Ttu1Yc0SViJJkjS+PGIsSZIkYTCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEkAHDzsAjQ8E5NbZ52+a8OaJapEkiRp+DxiLEnqKskJST6fZEeSe5Jc3rQfmeSmJPc390cMu1ZJWiyDsSRpNk8CV1TVC4EXA5cmOQWYBG6uqpOAm5vnkjTWDMaSpK6qak9V3dE8fhzYARwHnA9sambbBFwwlAIlqY8cY6wFmW18smOTpeUpyQRwOvAl4Jiq2gOt8Jzk6C6vWQesA3j2s5+9RJVK0sJ4xFiSNKckzwQ+Cbyxqr7f6+uqamNVra6q1StWrBhcgZLUBwZjSdKskhxCKxR/tKo+1TTvTbKymb4S2Des+iSpXwzGkqSukgT4MLCjqt7TNmkLsLZ5vBa4Yalrk6R+c4yxJGk2ZwKvB76S5M6m7a3ABmBzkkuAB4ELh1OeJPWPwViS1FVV3Qqky+Szl7IWSRo0h1JIkiRJGIwlSZIkoIdgPMvlQK9K8nCSO5vbeW2vuTLJziT3JTlnkG9AkiRJ6odexhhPXw70jiSHA7cnuamZ9t6qelf7zM2lQi8CTgWOBT6X5PlVtb+fhUuSJEn9NOcR41kuB9rN+cB1VfVEVT0A7ATO6EexkiRJ0qDMa4zxjMuBAlyW5O4kVyc5omk7Dnio7WW76RCkk6xLsi3JtqmpqflXLkmSJPVRz8G4w+VAPwA8F1gF7AHePT1rh5fXzzV4mVBJkiSNkJ6CcafLgVbV3qraX1VPAR/kp8MldgMntL38eOCR/pUsSZIk9V8vZ6XoeDnQJCvbZnsNsL15vAW4KMmhSU4ETgJu61/JkiRJUv/1claKbpcDvTjJKlrDJHYBbwCoqnuSbAbupXVGi0s9I4UkSZJG3ZzBeJbLgd44y2vWA+sXUZckSZK0pLzynSRJkoTBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSUBvl4TWEpiY3Np1
2q4Na5awEkmSpAOTR4wlSZIkDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAmAg4ddwHIyMbl11um7NqxZokokSZI0Xx4xliRJkjAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkAA4edgE68ExMbu06bdeGNUtYiSRJ0k95xFiSJEnCYCxJkiQBBmNJkiQJ6CEYJzkhyeeT7EhyT5LLm/Yjk9yU5P7m/oi211yZZGeS+5KcM8g3IEmSJPVDLz++exK4oqruSHI4cHuSm4DfAW6uqg1JJoFJ4C1JTgEuAk4FjgU+l+T5VbV/MG9BBxJ/uCdJkgZlziPGVbWnqu5oHj8O7ACOA84HNjWzbQIuaB6fD1xXVU9U1QPATuCMPtctSZIk9dW8xhgnmQBOB74EHFNVe6AVnoGjm9mOAx5qe9nupk2SJEkaWT0H4yTPBD4JvLGqvj/brB3aqsPy1iXZlmTb1NRUr2VIkiRJA9FTME5yCK1Q/NGq+lTTvDfJymb6SmBf074bOKHt5ccDj8xcZlVtrKrVVbV6xYoVC61fkiRJ6otezkoR4MPAjqp6T9ukLcDa5vFa4Ia29ouSHJrkROAk4Lb+lSxJWipJrk6yL8n2trarkjyc5M7mdt4wa5SkfunlrBRnAq8HvpLkzqbtrcAGYHOSS4AHgQsBquqeJJuBe2md0eJSz0ghSWPrGuC/AX8+o/29VfWupS9HkgZnzmBcVbfSedwwwNldXrMeWL+IuiRJI6Cqbml+eC1Jy55XvpMkLcRlSe5uhloc0W0mf2gtaZwYjCVJ8/UB4LnAKmAP8O5uM/pDa0njpJcxxgccr64mSd1V1d7px0k+CHx6iOVIUt94xFiSNC/Tp+psvAbY3m1eSRonHjGWJHWV5FrgLOCoJLuBdwBnJVlF6+JNu4A3DKs+Seong7EkqauqurhD84eXvBBJWgIOpZAkSZIwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAHL+AIfE5Nbu07btWHNElYiSZKkceARY0mSJAmDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJwDI+XZvG02yn2ZMkSRokg7EOCHMFbs9tLUmSHEohSZIkYTCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEkAHDzsAqR+mZjcOuwSJEnSGPOIsSRJkoTBWJIkSQIcSiEBsw/D2LVhzRJWIkmShsUjxpIkSRIGY0mSJAkwGEuSJElAD8E4ydVJ9iXZ3tZ2VZKHk9zZ3M5rm3Zlkp1J7ktyzqAKlyRJkvqplyPG1wDndmh/b1Wtam43AiQ5BbgIOLV5zfuTHNSvYiVJkqRBmTMYV9UtwGM9Lu984LqqeqKqHgB2Amcsoj5JkiRpSSxmjPFlSe5uhloc0bQdBzzUNs/upk2SJEkaaQsNxh8AngusAvYA727a02He6rSAJOuSbEuybWpqaoFlSJIkSf2xoGBcVXuran9VPQV8kJ8Ol9gNnNA26/HAI12WsbGqVlfV6hUrViykDEmSJKlvFhSMk6xse/oaYPqMFVuAi5IcmuRE4CTgtsWVKEmSJA3enJeETnItcBZwVJLdwDuAs5KsojVMYhfwBoCquifJZuBe4Eng0qraP5DKpTHgpaYlSRofcwbjqrq4Q/OHZ5l/PbB+MUVJkiRJS23OYKwD12xHOyVJkpYbg7E0B4dDSJJ0YDAYL3Me9ZUkSerNYi7wIUmSJC0b
HjFW33mUWpIkjSOPGEuSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRpFkmuTrIvyfa2tiOT3JTk/ub+iGHWKEn9YjCWJM3mGuDcGW2TwM1VdRJwc/NcksaewViS1FVV3QI8NqP5fGBT83gTcMFS1iRJg2IwliTN1zFVtQeguT96yPVIUl8YjCVJA5NkXZJtSbZNTU0NuxyNoYnJrcMuQQeQg4ddgDTO7LB1gNqbZGVV7UmyEtjXbcaq2ghsBFi9enUtVYGStBAeMZYkzdcWYG3zeC1wwxBrkaS+MRhLkrpKci3wReDkJLuTXAJsAF6e5H7g5c1zSRp7DqWQJHVVVRd3mXT2khYiSUvAI8aSJEkSBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgR4HmNpaGa7nPSuDWuWsBJJkgQeMZYkSZIAjxhLy45HoiVJWhiDsTSCZgu3YMCVJGkQHEohSZIkYTCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJAAOHnYBkuZvYnLrsEuQ1JiY3MquDWuGvgxJizfnEeMkVyfZl2R7W9uRSW5Kcn9zf0TbtCuT7ExyX5JzBlW4JEmS1E+9DKW4Bjh3RtskcHNVnQTc3DwnySnARcCpzWven+SgvlUrSZIkDcicwbiqbgEem9F8PrCpebwJuKCt/bqqeqKqHgB2Amf0p1RJkiRpcBb647tjqmoPQHN/dNN+HPBQ23y7m7afk2Rdkm1Jtk1NTS2wDEmSJKk/+n1WinRoq04zVtXGqlpdVatXrFjR5zIkSZKk+VloMN6bZCVAc7+vad8NnNA23/HAIwsvT5IkSVoaCz1d2xZgLbChub+hrf1jSd4DHAucBNy22CIPdJ6aS5IkafDmDMZJrgXOAo5Ksht4B61AvDnJJcCDwIUAVXVPks3AvcCTwKVVtX9AtUuSJEl9M2cwrqqLu0w6u8v864H1iylKkiRJWmpe+U7ST8w2bMerckmSljuDsXQAcby6JEnd9ft0bZIkSdJYMhhLkiRJGIwlSZqVQ5BGh38LDZpjjCUt2lw7K3+4J0kaBx4xliRJkjAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEeLo2ST3y/KGSpOXOI8aSJEkSHjGWNMZmO4rtRUUkSfPlEWNJkiQJg7EkSZIEGIwlSZIkwDHGkoZsrrNdOFZYo2BicqufRekA4BFjSZIkCYOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkS4AU+JI24uS4AIklSvxiMJQ2c4VaSNA4MxpKkBUmyC3gc2A88WVWrh1uRJC2OwXgJedRM0jL00qp6dNhFSFI/+OM7SZIkCYOxJGnhCvhsktuTrOs0Q5J1SbYl2TY1NbXoFY7CN2/tNYxCPZL6x2AsSVqoM6vqRcBvAZcmecnMGapqY1WtrqrVK1asWPoKJWkeDMaSpAWpqkea+33A9cAZw61IkhbHYCxJmrckhyU5fPox8Apg+3CrkqTF8awUkqSFOAa4Pgm09iUfq6rPDLckSVocg7Ekad6q6hvAacOuQ5L6yaEUkiRJEgZjSZIkCTAYS5IkSYDBWJIkSQL88d28eZUjSZKk5ckjxpIkSRJjfsTYo7eSJEnql0UF4yS7gMeB/cCTVbU6yZHAx4EJYBfw2qr6zuLKlCRJkgarH0eMX1pVj7Y9nwRurqoNSSab52/pw3r6xiPNkjR+RqHv7lbDxORWdm1Y83OPZ1vOfOfp5TWLMejlL8Qo/M11YBnEGOPzgU3N403ABQNYhyRJktRXiw3GBXw2ye1J1jVtx1TVHoDm/uhOL0yyLsm2JNumpqYWWYYkSZK0OIsdSnFmVT2S5GjgpiRf7fWFVbUR2AiwevXq
WmQdkiRJ0qIs6ohxVT3S3O8DrgfOAPYmWQnQ3O9bbJGSJEnSoC04GCc5LMnh04+BVwDbgS3A2ma2tcANiy1SkiRJGrTFDKU4Brg+yfRyPlZVn0nyZWBzkkuAB4ELF1+mJEmSNFgLDsZV9Q3gtA7t3wbOXkxRkiRJ0lLzktCSJEkSBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRpGZmY3Lqo10xMbu1pGd3mmW6fucyFLGO+65/v+1jItpKWO4OxJEmShMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJWsYmJrcuaL5eXzffOiYmt/7Msrutp5f197KcmeteyHra551r/m7rmO965jtvv/9eOnAZjCVJkiQMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSRphE5Nbu7ZP37q9Zua0Tu3dlt++jm7zzvW8W1291jvXtF7WM/P1vdY4cxlz1d5rjZ22Wbf33cvfb2b7bLXOVed8/5696McylqtR3TYGY0mSJAmDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkoABBuMk5ya5L8nOJJODWo8kaTjs5yUtNwMJxkkOAt4H/BZwCnBxklMGsS5J0tKzn5e0HA3qiPEZwM6q+kZV/QNwHXD+gNYlSVp69vOSlp1BBePjgIfanu9u2iRJy4P9vKRlJ1XV/4UmFwLnVNW/bp6/Hjijqv5t2zzrgHXN05OB+xawqqOARxdZbr+NYk1gXfNlXb0bxZpgYXU9p6pWDKKY5aaXfr5pX2xfP6qfr5mss7+ss7+s82d17esPHtAKdwMntD0/HnikfYaq2ghsXMxKkmyrqtWLWUa/jWJNYF3zZV29G8WaYHTrWkbm7Odh8X39uPwdrbO/rLO/rLN3gxpK8WXgpCQnJvkF4CJgy4DWJUlaevbzkpadgRwxrqonk1wG/BVwEHB1Vd0ziHVJkpae/byk5WhQQymoqhuBGwe1/MaihmIMyCjWBNY1X9bVu1GsCUa3rmXjAO7nO7HO/rLO/rLOHg3kx3eSJEnSuPGS0JIkSRJjEoyTXJ1kX5LtbW1XJXk4yZ3N7bwh1HVCks8n2ZHkniSXN+1HJrkpyf3N/REjUtfQtlmSpye5LcldTU1/1LQPe1t1q2von6+mjoOS/H2STzfPh7q9utQ0KttqV5KvNDVsa9qGvr20MKN8ueku+6SR+qyN6v6pQ50juW/oZhT75A41jkVfmORZST6R5KvN5/TXRqHOsQjGwDXAuR3a31tVq5rboMe5dfIkcEVVvRB4MXBpWpdEnQRurqqTgJub56NQFwxvmz0B/GZVnQasAs5N8mKGv6261QXD/3wBXA7saHs+7O3VqSYYjW0F8NKmhunT/YzC9tI8ZfQvN30NP79PGrXP2qjun2Ya1X1DN6PYJ3cyDn3hnwKfqaoXAKfR2q5Dr3MsgnFV3QI8Nuw6ZqqqPVV1R/P4cVp/1ONoXRZ1UzPbJuCCEalraKrlB83TQ5pbMfxt1a2uoUtyPLAG+FBb81C3V5eaRtlQt5cWbKQvN91lnzRSn7VR3T/NNKr7hk5GsU+eh5GqM8k/Al4CfBigqv6hqr7LCNQ5FsF4Fpclubv5WmvYXwdNAKcDXwKOqao90OqcgKNHpC4Y4jZrvoK6E9gH3FRVI7GtutQFw/98/QnwZuCptrZhb69ONcHwtxW0dqafTXJ7Wldbg+FvLy3MOF5uemQ/a6O6f5o2qvuGDv6E0euTOxmHvvCXgSngfzRDUz6U5DBGoM5xDsYfAJ5L66uXPcC7h1VIkmcCnwTeWFXfH1YdM3Wo
a6jbrKr2V9UqWlfIOiPJryzl+rvpUtdQt1WSVwL7qur2pVzvbGapaVT+LZ5ZVS+i9fX7pUleMqQ6tHjp0DYS3+SMm1HdP7Ub1X1Du1Hsk2cxDn3hwcCLgA9U1enADxmN4R3jG4yram/zj+kp4IO0vnpbckkOodXpfLSqPtU0702yspm+ktb/gode16hss+brki/QGqM39G3Vqa4R2FZnAq9OsovW18i/meQvGO726ljTCGwrAKrqkeZ+H3B9U8fIfL40Lz1dbnrEjNxnbVT3T92M6r6hMYp9ckdj0hfuBna3fUP7CVpBeeh1jm0wnt5wjdcA27vNO8AaQmt8zI6qek/bpC3A2ubxWuCGUahrmNssyYokz2oePwN4GfBVhr+tOtY17M9XVV1ZVcdX1QStS+3+dVW9jiFur241DXtbASQ5LMnh04+BVzR1DPXzpQUbx8tNj9RnbVT3TzON6r5hplHskzsZl76wqr4FPJTk5KbpbOBeRqHOqhr5G3Atra9of0zrfxmXAB8BvgLc3WzIlUOo69dpfb13N3BnczsP+Me0fk15f3N/5IjUNbRtBvxT4O+bdW8H3t60D3tbdatr6J+vthrPAj49CturS01D31a0xqvd1dzuAd42StvL24L+pucBXwO+Pv33HJVbl33SSH3WRnX/1KHOkdw3zFHzyPXJbbWNTV9Ia/jdtuZv/7+AI0ahTq98J0mSJDHGQykkSZKkfjIYS5IkSRiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSAP8fVjs2ooq1mV0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "display_dataset(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3337\n"
     ]
    }
   ],
   "source": [
    "filtered_cids = []\n",
    "for cid in list(set(df[\"case:concept:name\"].values)):\n",
    "    this_df = df[df[\"case:concept:name\"]==cid]\n",
    "    true_value = this_df.iloc[0]['total_time_true']\n",
    "    if true_value >= 7 and true_value <= 30:\n",
    "        if this_df.shape[0] >= 15 and this_df.shape[0] <= 50:\n",
    "            filtered_cids.append(cid)\n",
    "print(len(filtered_cids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200 50\n",
      "(8019, 8)\n"
     ]
    }
   ],
   "source": [
    "df_samples, train_cids, test_cids = sample_data(df, filtered_cids, 200, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR:root:[Errno 2] No such file or directory: '/Users/wangnaixuan/Documents/GitHub/prediction_service/data/drl_data/bpic2012/bpic2012_sample250.csv': read sample file failed.\n"
     ]
    }
   ],
   "source": [
    "from config import DeepReinforceLearningParameters\n",
    "\n",
    "save_path = DeepReinforceLearningParameters(dataset_name=\"bpic2012\").DATA_PATH\n",
    "\n",
    "df_samples.to_csv(save_path+\"/bpic2012_sample250.csv\", index=False)\n",
    "np.save(save_path+\"/train_case_ids.npy\", np.array(train_cids))\n",
    "np.save(save_path+\"/test_case_ids.npy\", np.array(test_cids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 2: 读取case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "train_cids = np.load(\"../data/drl_data/bpic2012/train_case_ids.npy\")\n",
    "test_cids = np.load(\"../data/drl_data/bpic2012/test_case_ids.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8019, 8)\n"
     ]
    }
   ],
   "source": [
    "df_sample = pd.DataFrame()\n",
    "for cid in set(df[\"case:concept:name\"]):\n",
    "    if cid in train_cids or cid in test_cids:\n",
    "        df_sample = df_sample.append(df[df[\"case:concept:name\"] == cid])\n",
    "print(df_sample.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DRLParameters\n",
    "\n",
    "save_path = DRLParameters(dataset_name=\"bpic2012\").DATA_PATH\n",
    "\n",
    "df_sample.to_csv(save_path+\"/bpic2012_nn_sample250.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BPIC2018数据集处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c4d632f7349c4cb3ac08b292f9c4fdef",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=43809.0), HTML(va…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "data_path = \"../data/raw_data/BPIC2018/12688355/BPI Challenge 2018.xes\"\n",
    "\n",
    "import pm4py\n",
    "\n",
    "log = pm4py.read_xes(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pm4py.convert_to_dataframe(log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>success</th>\n",
       "      <th>org:resource</th>\n",
       "      <th>docid_uuid</th>\n",
       "      <th>doctype</th>\n",
       "      <th>subprocess</th>\n",
       "      <th>docid</th>\n",
       "      <th>activity</th>\n",
       "      <th>note</th>\n",
       "      <th>eventid</th>\n",
       "      <th>identity:id</th>\n",
       "      <th>...</th>\n",
       "      <th>case:concept:name</th>\n",
       "      <th>case:penalty_amount1</th>\n",
       "      <th>case:payment_actual1</th>\n",
       "      <th>case:amount_applied1</th>\n",
       "      <th>case:penalty_amount2</th>\n",
       "      <th>case:payment_actual2</th>\n",
       "      <th>case:amount_applied2</th>\n",
       "      <th>case:penalty_amount3</th>\n",
       "      <th>case:payment_actual3</th>\n",
       "      <th>case:amount_applied3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>0;n/a</td>\n",
       "      <td>CD3DC291-76C6-420A-B3F1-7C808970915B</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-18008611495569447</td>\n",
       "      <td>mail income</td>\n",
       "      <td>none</td>\n",
       "      <td>null</td>\n",
       "      <td>510B5333-731A-40FD-B7D6-FC149E50E961</td>\n",
       "      <td>...</td>\n",
       "      <td>8b99873a6136cfa6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>0;n/a</td>\n",
       "      <td>CD3DC291-76C6-420A-B3F1-7C808970915B</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-18008611495569447</td>\n",
       "      <td>mail valid</td>\n",
       "      <td>none</td>\n",
       "      <td>null</td>\n",
       "      <td>F1DD45EF-80BF-46A5-97D6-CC5886DD2D23</td>\n",
       "      <td>...</td>\n",
       "      <td>8b99873a6136cfa6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>0;n/a</td>\n",
       "      <td>7CB69360-6D10-426F-A426-DDE3E24E4334</td>\n",
       "      <td>Entitlement application</td>\n",
       "      <td>Main</td>\n",
       "      <td>-18008615298673397</td>\n",
       "      <td>mail valid</td>\n",
       "      <td>none</td>\n",
       "      <td>null</td>\n",
       "      <td>AA02CA32-D021-4264-A7CB-660A9D603EFC</td>\n",
       "      <td>...</td>\n",
       "      <td>8b99873a6136cfa6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>0;n/a</td>\n",
       "      <td>7CB69360-6D10-426F-A426-DDE3E24E4334</td>\n",
       "      <td>Entitlement application</td>\n",
       "      <td>Main</td>\n",
       "      <td>-18008615298673397</td>\n",
       "      <td>mail valid</td>\n",
       "      <td>none</td>\n",
       "      <td>null</td>\n",
       "      <td>097D1E41-3CDB-4652-ABF1-EAEFC0410FA0</td>\n",
       "      <td>...</td>\n",
       "      <td>8b99873a6136cfa6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>True</td>\n",
       "      <td>fb5fa8</td>\n",
       "      <td>CCBAA174-CDD7-4D32-892E-F14197C65B8A</td>\n",
       "      <td>Parcel document</td>\n",
       "      <td>Main</td>\n",
       "      <td>-72051858488795160</td>\n",
       "      <td>initialize</td>\n",
       "      <td>none</td>\n",
       "      <td>-72051858488795157</td>\n",
       "      <td>96CBE6E6-9774-4DF8-842B-073F4FDCE2B8</td>\n",
       "      <td>...</td>\n",
       "      <td>8b99873a6136cfa6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2514261</th>\n",
       "      <td>True</td>\n",
       "      <td>155add</td>\n",
       "      <td>DA593697-DEA9-4066-9D1B-07BB9A9D3026</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-54037160734808716</td>\n",
       "      <td>decide</td>\n",
       "      <td>automatic</td>\n",
       "      <td>-90065940805989067</td>\n",
       "      <td>0215FFEC-132B-4EB9-9CD8-447822624E2D</td>\n",
       "      <td>...</td>\n",
       "      <td>ad5dfa0b929735be</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2514262</th>\n",
       "      <td>True</td>\n",
       "      <td>DP-Z</td>\n",
       "      <td>DA593697-DEA9-4066-9D1B-07BB9A9D3026</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-54037160734808716</td>\n",
       "      <td>begin payment</td>\n",
       "      <td>automatic during payment</td>\n",
       "      <td>-90065939195480795</td>\n",
       "      <td>FB7EE21E-49C3-4354-96AF-71F54D6B9EBB</td>\n",
       "      <td>...</td>\n",
       "      <td>ad5dfa0b929735be</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2514263</th>\n",
       "      <td>True</td>\n",
       "      <td>Notification automaton</td>\n",
       "      <td>DA593697-DEA9-4066-9D1B-07BB9A9D3026</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-54037160734808716</td>\n",
       "      <td>insert document</td>\n",
       "      <td>notification for applicant</td>\n",
       "      <td>-90065939195044055</td>\n",
       "      <td>D3AAFD05-E50B-4938-8577-EAA0FB5F4738</td>\n",
       "      <td>...</td>\n",
       "      <td>ad5dfa0b929735be</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2514264</th>\n",
       "      <td>False</td>\n",
       "      <td>Notification automaton</td>\n",
       "      <td>DA593697-DEA9-4066-9D1B-07BB9A9D3026</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-54037160734808716</td>\n",
       "      <td>insert document</td>\n",
       "      <td>notification for applicant</td>\n",
       "      <td>-126094734587029664</td>\n",
       "      <td>85F5B55E-107C-4034-B616-5FDBB076BFAD</td>\n",
       "      <td>...</td>\n",
       "      <td>ad5dfa0b929735be</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2514265</th>\n",
       "      <td>True</td>\n",
       "      <td>DP-Z</td>\n",
       "      <td>DA593697-DEA9-4066-9D1B-07BB9A9D3026</td>\n",
       "      <td>Payment application</td>\n",
       "      <td>Application</td>\n",
       "      <td>-54037160734808716</td>\n",
       "      <td>finish payment</td>\n",
       "      <td>automatic during payment</td>\n",
       "      <td>-18008338125604638</td>\n",
       "      <td>1518212C-3E0C-4EAC-A144-8EFB8D866494</td>\n",
       "      <td>...</td>\n",
       "      <td>ad5dfa0b929735be</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2514266 rows × 75 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         success            org:resource  \\\n",
       "0           True                   0;n/a   \n",
       "1           True                   0;n/a   \n",
       "2           True                   0;n/a   \n",
       "3           True                   0;n/a   \n",
       "4           True                  fb5fa8   \n",
       "...          ...                     ...   \n",
       "2514261     True                  155add   \n",
       "2514262     True                    DP-Z   \n",
       "2514263     True  Notification automaton   \n",
       "2514264    False  Notification automaton   \n",
       "2514265     True                    DP-Z   \n",
       "\n",
       "                                   docid_uuid                  doctype  \\\n",
       "0        CD3DC291-76C6-420A-B3F1-7C808970915B      Payment application   \n",
       "1        CD3DC291-76C6-420A-B3F1-7C808970915B      Payment application   \n",
       "2        7CB69360-6D10-426F-A426-DDE3E24E4334  Entitlement application   \n",
       "3        7CB69360-6D10-426F-A426-DDE3E24E4334  Entitlement application   \n",
       "4        CCBAA174-CDD7-4D32-892E-F14197C65B8A          Parcel document   \n",
       "...                                       ...                      ...   \n",
       "2514261  DA593697-DEA9-4066-9D1B-07BB9A9D3026      Payment application   \n",
       "2514262  DA593697-DEA9-4066-9D1B-07BB9A9D3026      Payment application   \n",
       "2514263  DA593697-DEA9-4066-9D1B-07BB9A9D3026      Payment application   \n",
       "2514264  DA593697-DEA9-4066-9D1B-07BB9A9D3026      Payment application   \n",
       "2514265  DA593697-DEA9-4066-9D1B-07BB9A9D3026      Payment application   \n",
       "\n",
       "          subprocess               docid         activity  \\\n",
       "0        Application  -18008611495569447      mail income   \n",
       "1        Application  -18008611495569447       mail valid   \n",
       "2               Main  -18008615298673397       mail valid   \n",
       "3               Main  -18008615298673397       mail valid   \n",
       "4               Main  -72051858488795160       initialize   \n",
       "...              ...                 ...              ...   \n",
       "2514261  Application  -54037160734808716           decide   \n",
       "2514262  Application  -54037160734808716    begin payment   \n",
       "2514263  Application  -54037160734808716  insert document   \n",
       "2514264  Application  -54037160734808716  insert document   \n",
       "2514265  Application  -54037160734808716   finish payment   \n",
       "\n",
       "                               note              eventid  \\\n",
       "0                              none                 null   \n",
       "1                              none                 null   \n",
       "2                              none                 null   \n",
       "3                              none                 null   \n",
       "4                              none   -72051858488795157   \n",
       "...                             ...                  ...   \n",
       "2514261                   automatic   -90065940805989067   \n",
       "2514262    automatic during payment   -90065939195480795   \n",
       "2514263  notification for applicant   -90065939195044055   \n",
       "2514264  notification for applicant  -126094734587029664   \n",
       "2514265    automatic during payment   -18008338125604638   \n",
       "\n",
       "                                  identity:id  ... case:concept:name  \\\n",
       "0        510B5333-731A-40FD-B7D6-FC149E50E961  ...  8b99873a6136cfa6   \n",
       "1        F1DD45EF-80BF-46A5-97D6-CC5886DD2D23  ...  8b99873a6136cfa6   \n",
       "2        AA02CA32-D021-4264-A7CB-660A9D603EFC  ...  8b99873a6136cfa6   \n",
       "3        097D1E41-3CDB-4652-ABF1-EAEFC0410FA0  ...  8b99873a6136cfa6   \n",
       "4        96CBE6E6-9774-4DF8-842B-073F4FDCE2B8  ...  8b99873a6136cfa6   \n",
       "...                                       ...  ...               ...   \n",
       "2514261  0215FFEC-132B-4EB9-9CD8-447822624E2D  ...  ad5dfa0b929735be   \n",
       "2514262  FB7EE21E-49C3-4354-96AF-71F54D6B9EBB  ...  ad5dfa0b929735be   \n",
       "2514263  D3AAFD05-E50B-4938-8577-EAA0FB5F4738  ...  ad5dfa0b929735be   \n",
       "2514264  85F5B55E-107C-4034-B616-5FDBB076BFAD  ...  ad5dfa0b929735be   \n",
       "2514265  1518212C-3E0C-4EAC-A144-8EFB8D866494  ...  ad5dfa0b929735be   \n",
       "\n",
       "        case:penalty_amount1 case:payment_actual1  case:amount_applied1  \\\n",
       "0                        NaN                  NaN                   NaN   \n",
       "1                        NaN                  NaN                   NaN   \n",
       "2                        NaN                  NaN                   NaN   \n",
       "3                        NaN                  NaN                   NaN   \n",
       "4                        NaN                  NaN                   NaN   \n",
       "...                      ...                  ...                   ...   \n",
       "2514261                  NaN                  NaN                   NaN   \n",
       "2514262                  NaN                  NaN                   NaN   \n",
       "2514263                  NaN                  NaN                   NaN   \n",
       "2514264                  NaN                  NaN                   NaN   \n",
       "2514265                  NaN                  NaN                   NaN   \n",
       "\n",
       "         case:penalty_amount2  case:payment_actual2 case:amount_applied2  \\\n",
       "0                         NaN                   NaN                  NaN   \n",
       "1                         NaN                   NaN                  NaN   \n",
       "2                         NaN                   NaN                  NaN   \n",
       "3                         NaN                   NaN                  NaN   \n",
       "4                         NaN                   NaN                  NaN   \n",
       "...                       ...                   ...                  ...   \n",
       "2514261                   NaN                   NaN                  NaN   \n",
       "2514262                   NaN                   NaN                  NaN   \n",
       "2514263                   NaN                   NaN                  NaN   \n",
       "2514264                   NaN                   NaN                  NaN   \n",
       "2514265                   NaN                   NaN                  NaN   \n",
       "\n",
       "         case:penalty_amount3 case:payment_actual3  case:amount_applied3  \n",
       "0                         NaN                  NaN                   NaN  \n",
       "1                         NaN                  NaN                   NaN  \n",
       "2                         NaN                  NaN                   NaN  \n",
       "3                         NaN                  NaN                   NaN  \n",
       "4                         NaN                  NaN                   NaN  \n",
       "...                       ...                  ...                   ...  \n",
       "2514261                   NaN                  NaN                   NaN  \n",
       "2514262                   NaN                  NaN                   NaN  \n",
       "2514263                   NaN                  NaN                   NaN  \n",
       "2514264                   NaN                  NaN                   NaN  \n",
       "2514265                   NaN                  NaN                   NaN  \n",
       "\n",
       "[2514266 rows x 75 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 395\n",
      "2000 782\n",
      "3000 1114\n",
      "4000 1483\n",
      "5000 1829\n",
      "6000 2199\n",
      "7000 2570\n",
      "8000 2954\n",
      "9000 3333\n",
      "10000 3744\n",
      "11000 4146\n",
      "12000 4544\n",
      "13000 4919\n",
      "14000 5313\n",
      "15000 5668\n",
      "16000 6047\n",
      "17000 6452\n",
      "18000 6824\n",
      "19000 7193\n",
      "20000 7571\n",
      "21000 7951\n",
      "22000 8344\n",
      "23000 8711\n",
      "24000 9097\n",
      "25000 9452\n",
      "26000 9836\n",
      "27000 10192\n",
      "28000 10581\n",
      "29000 10954\n",
      "30000 11333\n",
      "31000 11692\n",
      "32000 12084\n",
      "33000 12471\n",
      "34000 12879\n",
      "35000 13276\n",
      "36000 13634\n",
      "37000 13979\n",
      "38000 14341\n",
      "39000 14716\n",
      "40000 15105\n",
      "41000 15484\n",
      "42000 15870\n",
      "43000 16249\n"
     ]
    }
   ],
   "source": [
    "# Keep only the cases whose trace length (number of events) lies in [50, 75].\n",
    "# A single grouped pass replaces the per-case boolean scan of the whole frame,\n",
    "# which cost O(#cases * #rows) on a log of roughly 2.5M rows.\n",
    "# sort=False keeps the case ids in order of first appearance, so the result is\n",
    "# deterministic (iterating a set of strings is not stable across sessions).\n",
    "case_sizes = df.groupby(\"case:concept:name\", sort=False).size()\n",
    "cid_cnt = len(case_sizes)  # number of distinct cases inspected\n",
    "# Series.between is inclusive on both ends, matching the original >= 50 and <= 75.\n",
    "valid_cnts = case_sizes[case_sizes.between(50, 75)].index.tolist()\n",
    "print(cid_cnt, len(valid_cnts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 72d0460c4a2536e5\n",
      "100 8d97a6d4fe23d8bb\n",
      "200 1c969fed18db96a0\n",
      "300 c9d435310282323a\n",
      "400 6f66c9fe46a93345\n",
      "500 7648f09438ba25b2\n",
      "600 e434344636b1e716\n",
      "700 92b87bff925180df\n",
      "800 efd2e0a6b5cafc03\n",
      "900 b85b613eef84bf01\n",
      "1000 f7159334b7d564fa\n",
      "1100 62171c3c51aa87c0\n",
      "1200 1656e83811afbaa8\n",
      "1300 d6ed46c34ac7dd6e\n",
      "1400 66fa48d3fe74ace2\n",
      "1500 7565cb7c465e551b\n",
      "1600 b6450aa3feb00052\n",
      "1700 587c2dfdbfa46864\n",
      "1800 62fe4831b3306a5e\n",
      "1900 733db95cc9189835\n",
      "2000 131beb19276dbcb5\n",
      "2100 7c573baea6762e56\n",
      "2200 a4e8945d46cc18c0\n",
      "2300 2dc853cddb4e39da\n",
      "2400 b976158652d10f9d\n",
      "2500 d5bf691673159c78\n",
      "2600 2b4753c24f077255\n",
      "2700 38b1c6ebad4bef33\n",
      "2800 bbd6b53ca8886dae\n",
      "2900 29c729dfa83358c4\n",
      "3000 fb8034411e6267d9\n",
      "3100 1c3f5eae3a359490\n",
      "3200 971e40bcf18524e0\n",
      "3300 8e3bf01e702579dc\n",
      "3400 f0751f867fbc6f0e\n",
      "3500 b92166e94dcfcf88\n",
      "3600 1058a984173b0b79\n",
      "3700 922f38a4c400cac6\n",
      "3800 2e4c0931bac45914\n",
      "3900 46243dd357e01d87\n",
      "4000 319b28d56aac5682\n",
      "4100 e2b4c5f8103f7f83\n",
      "4200 3123bb7dcb5cfaf7\n",
      "4300 acb1c6646d7c02b1\n",
      "4400 e37162da235820dd\n",
      "4500 461ebb54c31b3690\n",
      "4600 0ccaa548e349e3c8\n",
      "4700 714341549328f6db\n",
      "4800 7f8b0cc362c3b731\n",
      "4900 428449ae24f92cd3\n",
      "5000 0626260c9be4edfc\n",
      "5100 4f258e819a8e23ae\n",
      "5200 ddf28fb47a872597\n",
      "5300 ab9c01a2dc642a45\n",
      "5400 caafea30062a70ed\n",
      "5500 c9246bd691a1f401\n",
      "5600 d81811b81a2c979b\n",
      "5700 bd1279e44e98f9c5\n",
      "5800 4799d919846d4712\n",
      "5900 8231b49f61ab076d\n",
      "6000 f123d1b241d229b7\n",
      "6100 8849c77ec435e3df\n",
      "6200 3c796698a24a048f\n",
      "6300 b1c87ebaf8df46e6\n",
      "6400 a457b4a2cc6f0bd2\n",
      "6500 1a0b647fb3964aa0\n",
      "6600 bbbbf644b95ccd52\n",
      "6700 6c110023fecb3205\n",
      "6800 a25ebaf2d68ca6cb\n",
      "6900 eaea2267ac2105e7\n",
      "7000 77e67dd2c76d6e0c\n",
      "7100 ad471ab5666b4023\n",
      "7200 073dd6b516dc27b0\n",
      "7300 c36a2802e7b916e6\n",
      "7400 77f595f29fc1c0cb\n",
      "7500 ae8c8dab1957ebaf\n",
      "7600 e4663c2e13a5e36c\n",
      "7700 49207b3261bb9385\n",
      "7800 edd027400075b9a9\n",
      "7900 d90c877818786b0e\n",
      "8000 0dffe64d98b22cbc\n",
      "8100 d6669d7d06fd7652\n",
      "8200 d1a11e800c15c127\n",
      "8300 bfcb158079417c51\n",
      "8400 f91a566104770ea3\n",
      "8500 d568ee551e15afdb\n",
      "8600 f9164921f16980f9\n",
      "8700 c8675de774b1bbb9\n",
      "8800 78fee94b496dff89\n",
      "8900 a077a97e77bc8b01\n",
      "9000 d2b61e63dda60587\n",
      "9100 f0f70092ec02c64d\n",
      "9200 91344a01f3e765d3\n",
      "9300 c812cd4811531f9b\n",
      "9400 d0b34106341619aa\n",
      "9500 c007ef00bdc8fd29\n",
      "9600 f195159e5af1ebd9\n",
      "9700 366f7057b187a614\n",
      "9800 9c1ffa0dc6d4db6e\n",
      "9900 4ae464d124b6bcb4\n",
      "10000 9abcee6119540096\n",
      "10100 aa3db7fc4e80fdc1\n",
      "10200 a2c74503e2f15e64\n",
      "10300 27e05910a8667b72\n",
      "10400 2a5993eb4aa1d632\n",
      "10500 5b376bb84a9c8c40\n",
      "10600 bdf54004912b4683\n",
      "10700 eecd85e2f3b7d97c\n",
      "10800 fc33c8a2f6b08992\n",
      "10900 c7be2a7ddfea1b08\n",
      "11000 fdcdaeafbefc747e\n",
      "11100 45b7c9112af9d62b\n",
      "11200 d987869d8ec35bf0\n",
      "11300 ba3688f188cebc84\n",
      "11400 0864f30a785b0049\n",
      "11500 0d27880c097154b4\n",
      "11600 3c168ebecced3323\n",
      "11700 c4c958d921ad1cab\n",
      "11800 2b3a6aa1f57e490a\n",
      "11900 ad9c0237e300419b\n",
      "12000 2819d560980808ae\n",
      "12100 61e2dd7f351c9a0f\n",
      "12200 398fdbebf68de54b\n",
      "12300 2fe645831b3ad225\n",
      "12400 d44e37e4c1a758fe\n",
      "12500 66ece425b1b936d4\n",
      "12600 b4a682dfbc64ae24\n",
      "12700 a9190ebcef31fcb5\n",
      "12800 4ba16b6c6b45c1ac\n",
      "12900 9a1f346d36d47591\n",
      "13000 c0d0c00ed94367b3\n",
      "13100 305a50fc6f2629bd\n",
      "13200 b1f72f6862511d8a\n",
      "13300 69285da4be57ca28\n",
      "13400 b78fafd052982b6e\n",
      "13500 196556b104757ba1\n",
      "13600 77cefea3a6518629\n",
      "13700 355db1aa960d6719\n",
      "13800 e760cdd27f9c13c6\n",
      "13900 b3153099cf21db4a\n",
      "14000 3480ffadd763c79a\n",
      "14100 17f07afde297eaf8\n",
      "14200 68cf1e2914c11b3e\n",
      "14300 5b481cc31207b4dd\n",
      "14400 9da1493da90b8452\n",
      "14500 a2a229a6153e1899\n",
      "14600 d4e3c6ea556be054\n",
      "14700 b971c599ba2929e3\n",
      "14800 89bc4a7ea531a15a\n",
      "14900 e14428462c78cf36\n",
      "15000 28ed89671b33424a\n",
      "15100 ab807eedd2a24dfe\n",
      "15200 f4c4e6eeb121b6ea\n",
      "15300 e23b5bb2a2b9ed47\n",
      "15400 fee4ea7cc032125d\n",
      "15500 9894de3cbc34abd7\n",
      "15600 5028ad5ed846d328\n",
      "15700 feb1db0565c5554b\n",
      "15800 dfa143c3336dc61d\n",
      "15900 94d1e845dbdb606f\n",
      "16000 38f54c0766d0c851\n",
      "16100 7ecce5c2d1698527\n",
      "16200 daf3635ffdf6b781\n",
      "16300 099761093d9232c7\n",
      "16400 7f20d874731d648a\n",
      "16500 1ae1bcbb97736182\n",
      "(959609, 75)\n"
     ]
    }
   ],
   "source": [
    "# Select every event that belongs to a retained case with one vectorised mask.\n",
    "# Growing a DataFrame with pd.concat inside a loop re-copies the accumulated rows\n",
    "# on every iteration (quadratic), and each iteration also rescanned all of df.\n",
    "# The selected rows keep the order and index they have in df.\n",
    "df_filtered = df[df[\"case:concept:name\"].isin(valid_cnts)]\n",
    "print(df_filtered.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns kept for the filtered export; list order defines the CSV column order.\n",
    "event_columns = [\"success\", \"doctype\", \"subprocess\", \"concept:name\", \"time:timestamp\"]\n",
    "case_columns = [\n",
    "    \"case:young farmer\", \"case:selected_random\", \"case:penalty_AJLP\", \"case:penalty_amount0\",\n",
    "    \"case:penalty_BGKV\", \"case:penalty_AUVP\", \"case:small farmer\", \"case:penalty_BGP\",\n",
    "    \"case:department\", \"case:penalty_C16\", \"case:penalty_BGK\", \"case:penalty_CC\",\n",
    "    \"case:penalty_AVJLP\", \"case:penalty_C9\", \"case:cross_compliance\", \"case:rejected\",\n",
    "    \"case:penalty_C4\", \"case:penalty_ABP\", \"case:penalty_B6\", \"case:penalty_B4\",\n",
    "    \"case:penalty_B5\", \"case:penalty_B2\", \"case:selected_risk\", \"case:penalty_B3\",\n",
    "    \"case:area\", \"case:selected_manually\", \"case:penalty_AGP\", \"case:penalty_B16\",\n",
    "    \"case:penalty_GP1\", \"case:penalty_B5F\", \"case:penalty_V5\", \"case:payment_actual0\",\n",
    "    \"case:amount_applied0\", \"case:redistribution\", \"case:penalty_JLP6\", \"case:penalty_JLP5\",\n",
    "    \"case:penalty_JLP2\", \"case:penalty_JLP3\", \"case:number_parcels\", \"case:penalty_JLP1\",\n",
    "    \"case:concept:name\",\n",
    "]\n",
    "\n",
    "# Project the filtered log onto those columns and write it out without the index.\n",
    "df_f = df_filtered.loc[:, event_columns + case_columns]\n",
    "filtered_csv_path = \"../data/filtered_data/BPIC2018/filtered_data.csv\"\n",
    "df_f.to_csv(filtered_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5000\n",
      "10000\n",
      "15000\n",
      "20000\n",
      "25000\n",
      "30000\n",
      "35000\n",
      "40000\n",
      "45000\n",
      "50000\n",
      "55000\n",
      "60000\n",
      "65000\n",
      "70000\n",
      "75000\n",
      "80000\n",
      "85000\n",
      "90000\n",
      "95000\n",
      "100000\n",
      "105000\n",
      "110000\n",
      "115000\n",
      "120000\n",
      "125000\n",
      "130000\n",
      "135000\n",
      "140000\n",
      "145000\n",
      "150000\n",
      "155000\n",
      "160000\n",
      "165000\n",
      "170000\n",
      "175000\n",
      "180000\n",
      "185000\n",
      "190000\n",
      "195000\n",
      "200000\n",
      "205000\n",
      "210000\n",
      "215000\n",
      "220000\n",
      "225000\n",
      "230000\n",
      "235000\n",
      "240000\n",
      "245000\n",
      "250000\n",
      "255000\n",
      "260000\n",
      "265000\n",
      "270000\n",
      "275000\n",
      "280000\n",
      "285000\n",
      "290000\n",
      "295000\n",
      "300000\n",
      "305000\n",
      "310000\n",
      "315000\n",
      "320000\n",
      "325000\n",
      "330000\n",
      "335000\n",
      "340000\n",
      "345000\n",
      "350000\n",
      "355000\n",
      "360000\n",
      "365000\n",
      "370000\n",
      "375000\n",
      "380000\n",
      "385000\n",
      "390000\n",
      "395000\n",
      "400000\n",
      "405000\n",
      "410000\n",
      "415000\n",
      "420000\n",
      "425000\n",
      "430000\n",
      "435000\n",
      "440000\n",
      "445000\n",
      "450000\n",
      "455000\n",
      "460000\n",
      "465000\n",
      "470000\n",
      "475000\n",
      "480000\n",
      "485000\n",
      "490000\n",
      "495000\n",
      "500000\n",
      "505000\n",
      "510000\n",
      "515000\n",
      "520000\n",
      "525000\n",
      "530000\n",
      "535000\n",
      "540000\n",
      "545000\n",
      "550000\n",
      "555000\n",
      "560000\n",
      "565000\n",
      "570000\n",
      "575000\n",
      "580000\n",
      "585000\n",
      "590000\n",
      "595000\n",
      "600000\n",
      "605000\n",
      "610000\n",
      "615000\n",
      "620000\n",
      "625000\n",
      "630000\n",
      "635000\n",
      "640000\n",
      "645000\n",
      "650000\n",
      "655000\n",
      "660000\n",
      "665000\n",
      "670000\n",
      "675000\n",
      "680000\n",
      "685000\n",
      "690000\n",
      "695000\n",
      "700000\n",
      "705000\n",
      "710000\n",
      "715000\n",
      "720000\n",
      "725000\n",
      "730000\n",
      "735000\n",
      "740000\n",
      "745000\n",
      "750000\n",
      "755000\n",
      "760000\n",
      "765000\n",
      "770000\n",
      "775000\n",
      "780000\n",
      "785000\n",
      "790000\n",
      "795000\n",
      "800000\n",
      "805000\n",
      "810000\n",
      "815000\n",
      "820000\n",
      "825000\n",
      "830000\n",
      "835000\n",
      "840000\n",
      "845000\n",
      "850000\n",
      "855000\n",
      "860000\n",
      "865000\n",
      "870000\n",
      "875000\n",
      "880000\n",
      "885000\n",
      "890000\n",
      "895000\n",
      "900000\n",
      "905000\n",
      "910000\n",
      "915000\n",
      "920000\n",
      "925000\n",
      "930000\n",
      "935000\n",
      "940000\n",
      "945000\n",
      "950000\n",
      "955000\n",
      "(99427, 46)\n"
     ]
    }
   ],
   "source": [
    "# Project-local preprocessing helper (resolved through the sys.path entry added in the first cell).\n",
    "from data_cleaner.data_preprocess import BPIC2018Preprocess\n",
    "\n",
    "# NOTE: this rebinds `df`, which the cells above use for a different frame; re-running\n",
    "# those cells after this one would operate on the preprocessed result instead.\n",
    "# NOTE(review): presumably preprocess() reads the filtered CSV written above - confirm in data_cleaner.\n",
    "df = BPIC2018Preprocess().preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  1192 , test_data_size:  512\n",
      "1000 1039085224\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed):\n",
    "    all_case_ids = set(data_processed['case:concept:name'].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    train_cids = random.sample(all_case_ids, num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    bools = np.array(list(range(0, 2)))\n",
    "    bools = np.reshape(bools, (bools.shape[0], 1))\n",
    "    ohe_bools = OneHotEncoder(sparse=False)\n",
    "    ohe_bools.fit(bools)\n",
    "    \n",
    "    doc_types = np.array(data_processed['doctype'].values)\n",
    "    doc_types = np.reshape(doc_types, (doc_types.shape[0], 1))\n",
    "    ohe_dt = OneHotEncoder(sparse=False)\n",
    "    ohe_dt.fit(doc_types)\n",
    "    \n",
    "    sub_processes = np.array(data_processed['subprocess'].values)\n",
    "    sub_processes = np.reshape(sub_processes, (sub_processes.shape[0], 1))\n",
    "    ohe_sp = OneHotEncoder(sparse=False)\n",
    "    ohe_sp.fit(sub_processes)\n",
    "    \n",
    "    activity_names = np.array(data_processed['concept:name'].values)\n",
    "    activity_names = np.reshape(activity_names, (activity_names.shape[0], 1))\n",
    "    ohe_act = OneHotEncoder(sparse=False)\n",
    "    ohe_act.fit(activity_names)\n",
    "    \n",
    "    departments = np.array(data_processed['case:department'].values)\n",
    "    departments = np.reshape(departments, (departments.shape[0], 1))\n",
    "    ohe_dps = OneHotEncoder(sparse=False)\n",
    "    ohe_dps.fit(departments)\n",
    "    \n",
    "    weeks = np.array(list(range(0, 7)))\n",
    "    weeks = np.reshape(weeks, (weeks.shape[0], 1))\n",
    "    ohe_week = OneHotEncoder(sparse=False)\n",
    "    ohe_week.fit(weeks)\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        cnt = 0\n",
    "        for cid in cid_set:\n",
    "            cnt += 1\n",
    "            if cnt % 1000 == 0:\n",
    "                print(cnt, cid)\n",
    "            thisdf = data_processed[data_processed['case:concept:name'] == cid]\n",
    "            trace_length = max(trace_length, thisdf.shape[0])\n",
    "            tmpdata = []\n",
    "            \n",
    "            start_time = datetime.datetime.strptime(thisdf.iloc[0]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            end_time = datetime.datetime.strptime(thisdf.iloc[-1]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "            last_time = start_time\n",
    "\n",
    "            for i in range(thisdf.shape[0]):\n",
    "                row = [int(thisdf.iloc[i]['case:concept:name'])]\n",
    "                \n",
    "                event_dt = datetime.datetime.strptime(thisdf.iloc[i]['time:timestamp_short'], '%Y-%m-%d %H:%M:%S')\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "                \n",
    "                activity_name = np.array(thisdf.iloc[i]['concept:name'])\n",
    "                activity_name = np.reshape(activity_name, (-1, 1))\n",
    "                success_name = np.array(thisdf.iloc[i]['success'])\n",
    "                success_name = np.reshape(success_name, (-1, 1))\n",
    "                doc_type = np.array(thisdf.iloc[i]['doctype'])\n",
    "                doc_type = np.reshape(doc_type, (-1, 1))\n",
    "                subprocess_name = np.array(thisdf.iloc[i]['subprocess'])\n",
    "                subprocess_name = np.reshape(subprocess_name, (-1, 1))\n",
    "                young_farmer = np.array(thisdf.iloc[i]['case:young farmer'])\n",
    "                young_farmer = np.reshape(young_farmer, (-1, 1))\n",
    "                selected_random = np.array(thisdf.iloc[i]['case:selected_random'])\n",
    "                selected_random = np.reshape(selected_random, (-1, 1))\n",
    "                penalty_ajlp = np.array(thisdf.iloc[i]['case:penalty_AJLP'])\n",
    "                penalty_ajlp = np.reshape(penalty_ajlp, (-1, 1))\n",
    "                # penalty_bgkv = np.array(thisdf.iloc[i]['case:penalty_BGKV'])\n",
    "                # penalty_bgkv = np.reshape(penalty_bgkv, (-1, 1))\n",
    "                penalty_auvp = np.array(thisdf.iloc[i]['case:penalty_AUVP'])\n",
    "                penalty_auvp = np.reshape(penalty_auvp, (-1, 1))\n",
    "                small_farmer = np.array(thisdf.iloc[i]['case:small farmer'])\n",
    "                small_farmer = np.reshape(small_farmer, (-1, 1))\n",
    "                penalty_bgp = np.array(thisdf.iloc[i]['case:penalty_BGP'])\n",
    "                penalty_bgp = np.reshape(penalty_bgp, (-1, 1))\n",
    "                department_name = np.array(thisdf.iloc[i]['case:department'])\n",
    "                department_name = np.reshape(department_name, (-1, 1))\n",
    "                penalty_c16 = np.array(thisdf.iloc[i]['case:penalty_C16'])\n",
    "                penalty_c16 = np.reshape(penalty_c16, (-1, 1))\n",
    "                penalty_bgk = np.array(thisdf.iloc[i]['case:penalty_BGK'])\n",
    "                penalty_bgk = np.reshape(penalty_bgk, (-1, 1))\n",
    "                penalty_cc = np.array(thisdf.iloc[i]['case:penalty_CC'])\n",
    "                penalty_cc = np.reshape(penalty_cc, (-1, 1))\n",
    "                penalty_avjlp = np.array(thisdf.iloc[i]['case:penalty_AVJLP'])\n",
    "                penalty_avjlp = np.reshape(penalty_avjlp, (-1, 1))\n",
    "                penalty_c9 = np.array(thisdf.iloc[i]['case:penalty_C9'])\n",
    "                penalty_c9 = np.reshape(penalty_c9, (-1, 1))\n",
    "                rejected = np.array(thisdf.iloc[i]['case:rejected'])\n",
    "                rejected = np.reshape(rejected, (-1, 1))\n",
    "                penalty_c4 = np.array(thisdf.iloc[i]['case:penalty_C4'])\n",
    "                penalty_c4 = np.reshape(penalty_c4, (-1, 1))\n",
    "                penalty_abp = np.array(thisdf.iloc[i]['case:penalty_ABP'])\n",
    "                penalty_abp = np.reshape(penalty_abp, (-1, 1))\n",
    "                penalty_b6 = np.array(thisdf.iloc[i]['case:penalty_B6'])\n",
    "                penalty_b6 = np.reshape(penalty_b6, (-1, 1))\n",
    "                penalty_b4 = np.array(thisdf.iloc[i]['case:penalty_B4'])\n",
    "                penalty_b4 = np.reshape(penalty_b4, (-1, 1))\n",
    "                penalty_b5 = np.array(thisdf.iloc[i]['case:penalty_B5'])\n",
    "                penalty_b5 = np.reshape(penalty_b5, (-1, 1))\n",
    "                penalty_b2 = np.array(thisdf.iloc[i]['case:penalty_B2'])\n",
    "                penalty_b2 = np.reshape(penalty_b2, (-1, 1))\n",
    "                selected_risk = np.array(thisdf.iloc[i]['case:selected_risk'])\n",
    "                selected_risk = np.reshape(selected_risk, (-1, 1))\n",
    "                penalty_b3 = np.array(thisdf.iloc[i]['case:penalty_B3'])\n",
    "                penalty_b3 = np.reshape(penalty_b3, (-1, 1))\n",
    "                selected_manually = np.array(thisdf.iloc[i]['case:selected_manually'])\n",
    "                selected_manually = np.reshape(selected_manually, (-1, 1))\n",
    "                penalty_agp = np.array(thisdf.iloc[i]['case:penalty_AGP'])\n",
    "                penalty_agp = np.reshape(penalty_agp, (-1, 1))\n",
    "                penalty_b16 = np.array(thisdf.iloc[i]['case:penalty_B16'])\n",
    "                penalty_b16 = np.reshape(penalty_b16, (-1, 1))\n",
    "                penalty_gp1 = np.array(thisdf.iloc[i]['case:penalty_GP1'])\n",
    "                penalty_gp1 = np.reshape(penalty_gp1, (-1, 1))\n",
    "                # penalty_b5f = np.array(thisdf.iloc[i]['case:penalty_B5F'])\n",
    "                # penalty_b5f = np.reshape(penalty_b5f, (-1, 1))\n",
    "                # penalty_v5 = np.array(thisdf.iloc[i]['case:penalty_V5'])\n",
    "                # penalty_v5 = np.reshape(penalty_v5, (-1, 1))\n",
    "                redistribution_name = np.array(thisdf.iloc[i]['case:redistribution'])\n",
    "                redistribution_name = np.reshape(redistribution_name, (-1, 1))\n",
    "                penalty_jpl6 = np.array(thisdf.iloc[i]['case:penalty_JLP6'])\n",
    "                penalty_jpl6 = np.reshape(penalty_jpl6, (-1, 1))\n",
    "                penalty_jpl5 = np.array(thisdf.iloc[i]['case:penalty_JLP5'])\n",
    "                penalty_jpl5 = np.reshape(penalty_jpl5, (-1, 1))\n",
    "                penalty_jpl2 = np.array(thisdf.iloc[i]['case:penalty_JLP2'])\n",
    "                penalty_jpl2 = np.reshape(penalty_jpl2, (-1, 1))\n",
    "                # penalty_jpl3 = np.array(thisdf.iloc[i]['case:penalty_JLP3'])\n",
    "                # penalty_jpl3 = np.reshape(penalty_jpl3, (-1, 1))\n",
    "                penalty_jlp1 = np.array(thisdf.iloc[i]['case:penalty_JLP1'])\n",
    "                penalty_jlp1 = np.reshape(penalty_jlp1, (-1, 1))\n",
    "                \n",
    "                weekday = event_dt.weekday()\n",
    "                weekday = np.reshape(np.array([weekday]), (-1, 1))\n",
    "                \n",
    "                row.extend(ohe_act.transform(activity_name).tolist()[0])  # 活动名称one-hot编码\n",
    "                row.extend(ohe_week.transform(weekday).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(success_name).tolist()[0])\n",
    "                row.extend(ohe_dt.transform(doc_type).tolist()[0])\n",
    "                row.extend(ohe_sp.transform(subprocess_name).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(young_farmer).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(selected_random).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_ajlp).tolist()[0])\n",
    "                # row.extend(ohe_bools.transform(penalty_bgkv).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_auvp).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(small_farmer).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_bgp).tolist()[0])\n",
    "                row.extend(ohe_dps.transform(department_name).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_c16).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_bgk).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_cc).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_avjlp).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_c9).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(rejected).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_c4).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_abp).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b6).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b4).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b5).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b2).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(selected_risk).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b3).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(selected_manually).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_agp).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_b16).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_gp1).tolist()[0])\n",
    "                # row.extend(ohe_bools.transform(penalty_b5f).tolist()[0])\n",
    "                # row.extend(ohe_bools.transform(penalty_v5).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(redistribution_name).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_jpl6).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_jpl5).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_jpl2).tolist()[0])\n",
    "                # row.extend(ohe_bools.transform(penalty_jpl3).tolist()[0])\n",
    "                row.extend(ohe_bools.transform(penalty_jlp1).tolist()[0])\n",
    "                \n",
    "                row.append(thisdf.iloc[i][\"case:penalty_amount0\"])\n",
    "                row.append(thisdf.iloc[i][\"case:cross_compliance\"])\n",
    "                row.append(thisdf.iloc[i][\"case:area\"])\n",
    "                row.append(thisdf.iloc[i][\"case:payment_actual0\"])\n",
    "                row.append(thisdf.iloc[i][\"case:amount_applied0\"])\n",
    "                row.append(thisdf.iloc[i][\"case:number_parcels\"])\n",
    "                \n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # 总花费时间\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # 相比上次活动花费时间\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # 距午夜时间\n",
    "                \n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                if i != thisdf.shape[0] - 1:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, max_train_trace_length = generate_data(train_cids_set)\n",
    "    test_data_set, max_test_trace_length = generate_data(test_cids_set)\n",
    "\n",
    "    min_value = [1e20] * (len(train_data_set[0][0]) - 1)\n",
    "    max_value = [-1] * (len(train_data_set[0][0]) - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row)-1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "    train_data_new = []\n",
    "    for i in range(len(train_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(train_data_set[i])):\n",
    "            row = [train_data_set[i][j][0]]\n",
    "            for k in range(1, len(train_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(train_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((train_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(train_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        train_data_new.append(seq)\n",
    "    test_data_new = []\n",
    "    for i in range(len(test_data_set)):\n",
    "        seq = []\n",
    "        for j in range(len(test_data_set[i])):\n",
    "            row = [test_data_set[i][j][0]]\n",
    "            for k in range(1, len(test_data_set[i][j])-1):\n",
    "                if max_value[k] == min_value[k]:\n",
    "                    row.append(test_data_set[i][j][k])\n",
    "                else:\n",
    "                    row.append((test_data_set[i][j][k]-min_value[k]) / (max_value[k]-min_value[k]))\n",
    "            row.append(test_data_set[i][j][-1])\n",
    "            seq.append(row)\n",
    "        test_data_new.append(seq)\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(train_data_new, maxlen=20, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(test_data_new, maxlen=20, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the train_data_set / test_data_set arrays to .npy files.\n",
    "for npy_path, split_array in (\n",
    "    (\"../data/filtered_data/BPIC2018/train_data.npy\", train_data_set),\n",
    "    (\"../data/filtered_data/BPIC2018/test_data.npy\", test_data_set),\n",
    "):\n",
    "    np.save(npy_path, split_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"../data/filtered_data/BPIC2018/filtered_data_v1.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   success              doctype   subprocess    concept:name  \\\n",
      "0     True  Payment application  Application      mail valid   \n",
      "1     True  Payment application  Application     mail income   \n",
      "2     True  Geo parcel document         Main      initialize   \n",
      "3     True  Geo parcel document     Declared   begin editing   \n",
      "4     True  Geo parcel document     Declared  finish editing   \n",
      "\n",
      "  time:timestamp_short  case:young farmer  case:selected_random  \\\n",
      "0  2016-05-17 00:00:00              False                 False   \n",
      "1  2016-05-18 00:00:00              False                 False   \n",
      "2  2016-06-22 01:10:12              False                 False   \n",
      "3  2016-06-22 01:10:12              False                 False   \n",
      "4  2016-06-22 01:10:15              False                 False   \n",
      "\n",
      "   case:penalty_AJLP  case:penalty_amount0  case:penalty_BGKV  ...  \\\n",
      "0              False                   0.0              False  ...   \n",
      "1              False                   0.0              False  ...   \n",
      "2              False                   0.0              False  ...   \n",
      "3              False                   0.0              False  ...   \n",
      "4              False                   0.0              False  ...   \n",
      "\n",
      "   case:penalty_JLP6  case:penalty_JLP5  case:penalty_JLP2 case:penalty_JLP3  \\\n",
      "0              False              False              False             False   \n",
      "1              False              False              False             False   \n",
      "2              False              False              False             False   \n",
      "3              False              False              False             False   \n",
      "4              False              False              False             False   \n",
      "\n",
      "   case:number_parcels  case:penalty_JLP1  case:concept:name  time_spent  \\\n",
      "0                   38              False         -782991358    0.000000   \n",
      "1                   38              False         -782991358    1.000000   \n",
      "2                   38              False         -782991358   36.048750   \n",
      "3                   38              False         -782991358   36.048750   \n",
      "4                   38              False         -782991358   36.048785   \n",
      "\n",
      "   total_time_pred  total_time_true  \n",
      "0       258.732269       234.628345  \n",
      "1       260.099609       234.628345  \n",
      "2       261.696165       234.628345  \n",
      "3       254.015379       234.628345  \n",
      "4       253.477526       234.628345  \n",
      "\n",
      "[5 rows x 49 columns]\n"
     ]
    }
   ],
   "source": [
    "# Load the BPIC2018 event table that carries the total_time_pred / total_time_true columns.\n",
    "data_path = \"../data/drl_data/bpic2018/bpic2018_wp_nn.csv\"\n",
    "\n",
    "df = pd.read_csv(data_path)\n",
    "# A bare last expression renders the wide preview as one rich table;\n",
    "# print() wrapped the columns into several plain-text blocks.\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 1: 抽样case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1704 1704\n",
      "50 75 213.6357523148148 1010.5826851851853\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAF1CAYAAADr3izzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAhkklEQVR4nO3dfbRddX3n8ffHBEEUK5QLE4F40VJW1VWDvYuxpbW0qCBYg12DA602TmljO3VGWx0N2uVDV+2krQ+drlZtFMa0VYSKFCq0Y5pqqVOfAgIGAwIaIRCTCCj4MFTgO3+cfdvj7T0359zzdM/N+7XWXWfv39nn7O+++2TvT373d/ZOVSFJkiQd6B417gIkSZKkpcBgLEmSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmMtE0l2JnnOGNY7naSSrBz1uiVJ80vy/iS/O+46NHkMxlq0cYXRcToQt1mSwOOfDgwGYw2NvaiSdODwmK/lwGCsRUnyF8Bq4G+SfCvJa9uGFZyf5A7gH5pl/yrJ15J8M8k1SZ7W9j6PSfL2JF9tnv9kksc0zz0ryT8n+UaSG5Kc2mVtj0qyIcntSe5JcmmSI5rnZmtcl+SOJF9P8oY59WxOcl+SHc127eq0zW2r/cX53k+SloNuj/lJTp09Zra99l97mhc6Ps+zzh1JXtA2v7I5xj6zme94bpnzPi9L8sk5bZXkh5rpg5O8rTmG70nynrbz0JFJPtqch+5N8k9JzE7LmDtXi1JVLwXuAH6uqh5XVX/Q9vRPAz8CnN7M/y1wAnAUcB3wgbZl3wb8GPATwBHAa4FHkhwDXAX8btP+GuCyJFNdlPffgbObOp4I3Af86ZxlfhI4ETgNeGOSH2na3wRMA08Gngu8pMtt7vR+kjTxejzmL6Sb4/Osi4Hz2uZPB75eVdc18wudW3rx+8APA2uAHwKOAd7YPPdqYBcwBRwNvB6oRa5HE8BgrGF4c1V9u6q+C1BVF1XVA1X1IPBm4BlJfqD5X/cvA6+sqruq6uGq+udmuZcAV1fV1VX1SFVtAbYBZ3ax/pcDb6iqXW3r/E9z/sz3lqr6blXdANwAPKNpfzHwe1V1X1XtAv64y23u9H6StNx93zF/P7o5Ps/6IPDCJIc287/QtAGdzy29FJ4kwK8Cv1lV91bVA8DvAec2i3wPWAU8qaq+V1X/VFUG42XM8UAahjtnJ5KsAN4KnEPrf9yPNE8dCRwMHALcPs97PAk4J8nPtbUdBHy8i/U/Cbg8ySNtbQ/T+t/+rK+1TX8HeFwz/cT2+udML6TT+0nSctftcRIWPj7f1b5gVd2WZAfwc0n+BnghcBLs99zyzR7qmQIOBa5tZWQAAqxopv+QVuj+WPP8pqra2MP7a8IYjNWPTv9rbm//BWAt8BxgJ/ADtP50FuDrwP8DnkKrl7XdncBfVNWvLqKuO4Ffrqr/O/eJJNP7ee1u4Fjgi838cXOet6dA0oGqm2P+t2kFTeBfA2z7ELiOx+cOZodTPAr4YlXd1rQvdG6Za25N/6Htua8D3wWeVlV3zX1h04P8auDVzRjmjyf5XFVt7bJ+TRiHUqgfe2iNxV3IYcCDwD20Dky/N/tEVT0CXAS8I8kTk6xI8uNJDgb+klYvwelN+yHNlzqO7aKu9wBvTfIkgCRTSdZ2uU2XAhckObwZ5/yKOc93s82StBx1c/z7EnBIkrOSHAT8Nq2/Ds7q9fj8IeB5wK/TNoyCBc4t87gBeFqSNUkOodUDDPzreei9wDuTHNXUdEyS05vpFyT5oWbIxf20ercf3s/vQBPMYKx+/E/gt5tv676mwzJ/DnyV1p/Ivgh8es7zrwG+AHwOuJfWlyAeVVV30uoNeD2wj1Yvw/+gu8/s/wKupPWnrweadf7HLrfpd2h90eIrwN8DH6Z18J3VzTZL0nK03+NfVX0T+K/A+2gd979N65g6q6fjc1XtBj5F6wval7Q9tb9z
S/t7fInWsf3vgVuBT85Z5HXAbcCnk9zfLHdi89wJzfy3mjreVVWf6LQuTb44hlzqLMmvA+dW1U+PuxZJkjRc9hhLbZKsSnJKc63NE2mNLbt83HVJkqTh88t30vd7NPBnwPHAN2iNb3vXOAuSJEmj4VAKSZIkCYdSSJIkSYDBWJIkSQKWyBjjI488sqanp8ddhqQD3LXXXvv1qpra/5JaDI/1kpaChY71SyIYT09Ps23btnGXIekAl+Sr465hOfNYL2kpWOhY71AKSZIkCYOxJEmSBBiMJUmSJGCJjDGWJC1dSXYCDwAPAw9V1UySI4BLgGlgJ/DiqrpvXDVK0iDYYyxJ6sbPVNWaqppp5jcAW6vqBGBrMy9JE81gLElajLXA5mZ6M3D2+EqRpMEwGEuS9qeAjyW5Nsn6pu3oqtoN0DweNbbqJGlAHGMsSdqfU6rq7iRHAVuS3NztC5sgvR5g9erVw6pPkgbCHmNJ0oKq6u7mcS9wOXAysCfJKoDmcW+H126qqpmqmpma8qaCkpY2g7EkqaMkj01y2Ow08DxgO3AlsK5ZbB1wxXgqlKTBcSiFJGkhRwOXJ4HWOeODVfV3ST4HXJrkfOAO4Jwx1ihJA2EwliR1VFVfBp4xT/s9wGmjr0iShsehFJIkSRL2GA/V9Iarulpu58azhlyJJEmS9sceY0mSJAmDsSRJkgQYjCVJkiSgi2Cc5KIke5Nsb2u7JMn1zc/OJNc37dNJvtv23HuGWLskSZI0MN18+e79wJ8Afz7bUFX/eXY6yduBb7Ytf3tVrRlQfZIkSdJI7DcYV9U1Sabney6tK76/GPjZAdclSZIkjVS/Y4x/CthTVbe2tR2f5PNJ/jHJT3V6YZL1SbYl2bZv374+y5AkSZL6028wPg+4uG1+N7C6qk4Cfgv4YJLHz/fCqtpUVTNVNTM1NdVnGZIkSVJ/Fh2Mk6wEfh64ZLatqh5sbhNKVV0L3A78cL9FSpIkScPWT4/xc4Cbq2rXbEOSqSQrmuknAycAX+6vREmSJGn4urlc28XAp4ATk+xKcn7z1Ll8/zAKgGcDNya5Afgw8GtVde8gC5YkSZKGoZurUpzXof1l87RdBlzWf1mSJEnSaHnnO0mSJAmDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEgArx13AUjK94aqultu58awhVyJJkqRRs8dYkiRJootgnOSiJHuTbG9re3OSu5Jc3/yc2fbcBUluS3JLktOHVbgkSZI0SN30GL8fOGOe9ndW1Zrm52qAJE8FzgWe1rzmXUlWDKpYSZIkaVj2G4yr6hrg3i7fby3woap6sKq+AtwGnNxHfZIkSdJI9DPG+BVJbmyGWhzetB0D3Nm2zK6m7d9Jsj7JtiTb9u3b10cZkiRJUv8WG4zfDTwFWAPsBt7etGeeZWu+N6iqTVU1U1UzU1NTiyxDkiRJGoxFBeOq2lNVD1fVI8B7+bfhEruA49oWPRa4u78SJUmSpOFbVDBOsqpt9kXA7BUrrgTOTXJwkuOBE4DP9leiJEmSNHz7vcFHkouBU4Ejk+wC3gScmmQNrWESO4GXA1TVTUkuBb4IPAT8RlU9PJTKJUkj0VxdaBtwV1W9IMkRwCXANK1zwIur6r7xVShJg7HfYFxV583TfOECy78VeGs/RUmSlpRXAjuAxzfzG4CtVbUxyYZm/nXjKk6SBsU730mSOkpyLHAW8L625rXA5mZ6M3D2iMuSpKEwGEuSFvJHwGuBR9rajq6q3QDN41GdXuylOSVN
EoOxJGleSV4A7K2qaxf7Hl6aU9Ik2e8YY0nSAesU4IVJzgQOAR6f5C+BPUlWVdXu5ipFe8dapSQNiD3GkqR5VdUFVXVsVU0D5wL/UFUvoXVpznXNYuuAK8ZUoiQNlMFYktSrjcBzk9wKPLeZl6SJ51AKSdJ+VdUngE800/cAp42zHkkaBnuMJUmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkoAugnGSi5LsTbK9re0Pk9yc5MYklyd5QtM+neS7Sa5vft4zxNolSZKkgemmx/j9wBlz2rYAT6+qHwW+BFzQ9tztVbWm+fm1wZQpSZIkDdd+g3FVXQPcO6ftY1X1UDP7aeDYIdQmSZIkjcwgxhj/MvC3bfPHJ/l8kn9M8lMDeH9JkiRp6Fb28+IkbwAeAj7QNO0GVlfVPUl+DPjrJE+rqvvnee16YD3A6tWr+ylDkiRJ6tuie4yTrANeAPxiVRVAVT1YVfc009cCtwM/PN/rq2pTVc1U1czU1NRiy5AkSZIGYlE9xknOAF4H/HRVfaetfQq4t6oeTvJk4ATgywOpdB7TG67qarmdG88aVgmSJElaJvYbjJNcDJwKHJlkF/AmWlehOBjYkgTg080VKJ4N/E6Sh4CHgV+rqnvnfWNJkiRpCdlvMK6q8+ZpvrDDspcBl/VblCRJkjRq3vlOkiRJwmAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRIAK8ddwChMb7hq3CVIkiRpibPHWJIkScJgLEmSJAEHyFCKpa6XoR47N541xEokSZIOXPYYS5I6SnJIks8muSHJTUne0rQfkWRLklubx8PHXask9ctgLElayIPAz1bVM4A1wBlJngVsALZW1QnA1mZekiaawViS1FG1fKuZPaj5KWAtsLlp3wycPfrqJGmwDMaSpAUlWZHkemAvsKWqPgMcXVW7AZrHo8ZYoiQNhMFYkrSgqnq4qtYAxwInJ3l6t69Nsj7JtiTb9u3bN7QaJWkQDMaSpK5U1TeATwBnAHuSrAJoHvd2eM2mqpqpqpmpqalRlSpJi2IwliR1lGQqyROa6ccAzwFuBq4E1jWLrQOuGEuBkjRAXsdYkrSQVcDmJCtodaZcWlUfTfIp4NIk5wN3AOeMs0hJGgSDsSSpo6q6EThpnvZ7gNNGX5EkDY9DKSRJkiQMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSUAXwTjJRUn2Jtne1nZEki1Jbm0eD2977oIktyW5JcnpwypckiRJGqRueozfD5wxp20DsLWqTgC2NvMkeSpwLvC05jXvam4jKkmSJC1p+w3GVXUNcO+c5rXA5mZ6M3B2W/uHqurBqvoKcBtw8mBKlSRJkoZnsWOMj66q3QDN41FN+zHAnW3L7Wra/p0k65NsS7Jt3759iyxDkiRJGoxBf/ku87TVfAtW1aaqmqmqmampqQGXIUmSJPVmscF4T5JVAM3j3qZ9F3Bc23LHAncvvjxJkiRpNBYbjK8E1jXT64Ar2trPTXJwkuOBE4DP9leiJEmSNHwr97dAkouBU4Ejk+wC3gRsBC5Ncj5wB3AOQFXdlORS4IvAQ8BvVNXDQ6p9bKY3XDXuEiRJkjRg+w3GVXVeh6dO67D8W4G39lOUJEmSNGre+U6SJEnCYCxJkiQBBmNJkiQJMBhLkg5Afola0nwMxpIkSRIGY0mSJAkwGEuSJEmAwViSJEkCDMaS
JEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBMDKcRegyTC94aqultu58awhVyJJkjQc9hhLkiRJGIwlSZIkwGAsSZIkAY4xnjiO9ZUkSRoOe4wlSZIkDMaSJEkS0MdQiiQnApe0NT0ZeCPwBOBXgX1N++ur6urFrkeSJEkahUUH46q6BVgDkGQFcBdwOfBfgHdW1dsGUaAkSZI0CoMaSnEacHtVfXVA7ydJkiSN1KCC8bnAxW3zr0hyY5KLkhw+oHVIkiRJQ9N3ME7yaOCFwF81Te8GnkJrmMVu4O0dXrc+ybYk2/bt2zffIpKkMUtyXJKPJ9mR5KYkr2zaj0iyJcmtzaOdIJIm3iCuY/x84Lqq2gMw+wiQ5L3AR+d7UVVtAjYBzMzM1ADq0CJ0e11kSQesh4BXV9V1SQ4Drk2yBXgZsLWqNibZAGwAXjfGOiWpb4MYSnEebcMokqxqe+5FwPYBrEOSNAZVtbuqrmumHwB2AMcAa4HNzWKbgbPHUqAkDVBfPcZJDgWeC7y8rfkPkqwBCtg55zktc730QHt3PmmyJJkGTgI+AxxdVbuhFZ6THNXhNeuB9QCrV68eUaWStDh9BeOq+g7wg3PaXtpXRZKkJSfJ44DLgFdV1f1Junqdw+YkTZJBjDHWEuTYYUmDkuQgWqH4A1X1kaZ5T5JVTW/xKmDv+CqUpMHwltCSpI7S6hq+ENhRVe9oe+pKYF0zvQ64YtS1SdKg2WMsSVrIKcBLgS8kub5pez2wEbg0yfnAHcA54ylPkgbHYCxJ6qiqPgl0GlB82ihrkaRhcyiFJEmShMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJAKwcdwE6cE1vuKqr5XZuPGvIlUiSJNljLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkS0OdVKZLsBB4AHgYeqqqZJEcAlwDTwE7gxVV1X39lSpIkScM1iB7jn6mqNVU108xvALZW1QnA1mZekiRJWtKGMZRiLbC5md4MnD2EdUiSJEkD1e8NPgr4WJIC/qyqNgFHV9VugKraneSo+V6YZD2wHmD16tV9lqHlzBuBSJKkUeg3GJ9SVXc34XdLkpu7fWETojcBzMzMVJ91SJIkSX3pKxhX1d3N494klwMnA3uSrGp6i1cBewdQp7Qs2PstSdLStegxxkkem+Sw2WngecB24EpgXbPYOuCKfouUJEmShq2fHuOjgcuTzL7PB6vq75J8Drg0yfnAHcA5/ZcpSZIkDdeig3FVfRl4xjzt9wCn9VOUdKBzyIUkSaPnne8kSZIk+r8qhbRk2MsqSZL6YY+xJEmShD3GOgDZsyxJkuZjj7EkSZKEwViSJEkCHEoh9a3boRmSJGlps8dYkiRJwmAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkaQFJLkqyN8n2trYjkmxJcmvzePg4a5SkQTEYS5IW8n7gjDltG4CtVXUCsLWZl6SJZzCWJHVUVdcA985pXgtsbqY3A2ePsiZJGhaDsSSpV0dX1W6A5vGoTgsmWZ9kW5Jt+/btG1mB3fKW7pLaGYwlSUNTVZuqaqaqZqampsZdjiQtyGAsSerVniSrAJrHvWOuR5IGwmAsSerVlcC6ZnodcMUYa5GkgTEYS5I6SnIx8CngxCS7kpwPbASem+RW4LnNvCRNvJXjLkCStHRV
1XkdnjptpIVI0gjYYyxJkiRhMJYkSZIAh1JIGrJurxO7c+NZQ65EkqSF2WMsSZIkYY+x1JF3xJIk6cBij7EkSZKEPcbSAcFxvpIk7Z/BWNJEMeRr2KY3XOXnRzpAOZRCkiRJwmAsSZIkAQZjSZIkCegjGCc5LsnHk+xIclOSVzbtb05yV5Lrm58zB1euJEmSNBz9fPnuIeDVVXVdksOAa5NsaZ57Z1W9rf/yJEmSpNFYdI9xVe2uquua6QeAHcAxgypMkqRRmL3SiTf1kTSQy7UlmQZOAj4DnAK8IskvAdto9SrfN89r1gPrAVavXj2IMqQDjidySZIGp+8v3yV5HHAZ8Kqquh94N/AUYA2wG3j7fK+rqk1VNVNVM1NTU/2WIUmSJPWlr2Cc5CBaofgDVfURgKraU1UPV9UjwHuBk/svU5IkSRqufq5KEeBCYEdVvaOtfVXbYi8Cti++PEmSJGk0+hljfArwUuALSa5v2l4PnJdkDVDATuDlfaxD0gj1MmbZW+ZKkpabRQfjqvokkHmeunrx5UiSJEnjMZCrUkhSv7zChiRp3LwltCRJkoTBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSfKqKBPAfaRR8HJtkhbFk5Qkabmxx1iSJEnCYCxJkiQBBmNJkiQJMBhLkiRJgMFYkrQMdfPl0G6/QOoXTf/NUv1dTG+4qq/alup2afQMxpIkSRIGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEnz8nJuS9fs79zfvQbNYCxJkiRhMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZL61n51hEFcKWG+91js+x5oV26Ye8UKr2ChXhiMJUmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJOkBMb7iq6ysTdFpuoSsd9PL+vayzm+V6XW+n+rtZ59zlOtXTS00L1bO/Grutu9tle73CyKRc7aLfz+eBwmAsSZIkYTCWJEmSgCEG4yRnJLklyW1JNgxrPZKk8fA4L2m5GUowTrIC+FPg+cBTgfOSPHUY65IkjZ7HeUnL0bB6jE8GbquqL1fVvwAfAtYOaV2SpNHzOC9p2RlWMD4GuLNtflfTJklaHjzOS1p2UlWDf9PkHOD0qvqVZv6lwMlV9d/allkPrG9mTwRuWcSqjgS+3me5S8ly2x5wmyaF29TypKqaGkYxy003x/mmvZdj/VL5HC6FOqxh6dQAS6MOaxhcDR2P9Sv7fONOdgHHtc0fC9zdvkBVbQI29bOSJNuqaqaf91hKltv2gNs0KdwmLcJ+j/PQ27F+qeyzpVCHNSydGpZKHdYwmhqGNZTic8AJSY5P8mjgXODKIa1LkjR6HuclLTtD6TGuqoeSvAL4P8AK4KKqumkY65IkjZ7HeUnL0bCGUlBVVwNXD+v9G30NxViCltv2gNs0Kdwm9WwIx/mlss+WQh3W0LIUaoClUYc1tAy1hqF8+U6SJEmaNN4SWpIkSWKCgnGSnUm+kOT6JNuatiOSbElya/N4+Ljr7EWHbXpzkruatuuTnDnuOnuR5AlJPpzk5iQ7kvz4MthP823TxO6nJCe21X19kvuTvGqS99MC2zSx+2k5SnJcko83/45uSvLKpr3jZy/JBc0tp29JcvoAajgkyWeT3NDU8JZR19D2viuSfD7JR8dRQ6/n1SH+Hno6bwzhM9HzMXFI++M3m8/k9iQXN5/VUdfwymb9NyV5VdM29BqSXJRkb5LtbW09rzfJjzWf6duS/HGS9FxMVU3ED7ATOHJO2x8AG5rpDcDvj7vOAWzTm4HXjLu2PrZpM/ArzfSjgScsg/003zZN9H5q27YVwNeAJ036fuqwTctiPy2XH2AV8Mxm+jDgS7RuJz3vZ6957gbgYOB44HZgRZ81BHhcM30Q8BngWaOsoa2W3wI+CHy0mR9pDb2cV4f8e+j6vDHMOpr33+8xcUify2OArwCPaeYvBV424hqe
DmwHDqX1HbS/B04YRQ3As4FnAtv7+SwCnwV+nNa/878Fnt9rLRPTY9zBWlr/oGgezx5fKUryeFof7gsBqupfquobTPB+WmCblovTgNur6qtM8H6ao32btIRU1e6quq6ZfgDYQSsQdPrsrQU+VFUPVtVXgNto3Yq6nxqqqr7VzB7U/NQoawBIcixwFvC+tuaR1tDBqH8PvZ43hv276OaYOKwaVgKPSbKSVji9e8Q1/Ajw6ar6TlU9BPwj8KJR1FBV1wD3zmnuab1JVgGPr6pPVSsl/zmLOI9NUjAu4GNJrk3rTkoAR1fVbmgdcIGjxlbd4sy3TQCvSHJj86eFiflzNvBkYB/wv5s/D74vyWOZ7P3UaZtgcvdTu3OBi5vpSd5P7dq3CZbHflp2kkwDJ9Hqse302RvKbaebIQzXA3uBLVU18hqAPwJeCzzS1jbqGno5rw6rhl7PG8O+FXk3x8SB11BVdwFvA+4AdgPfrKqPjbIGWr3Fz07yg0kOBc6kdROfce2LXtd7TDPdVz2TFIxPqapnAs8HfiPJs8dd0ADMt03vBp4CrKH1j+Pt4yuvZytp/Snk3VV1EvBtWn/+mGSdtmmS9xMAad2U4YXAX427lkGZZ5smfj8tR0keB1wGvKqq7l9o0Xna+r6UUlU9XFVraN2t7+QkTx9lDUleAOytqmu7fcmga2j0cl4dVg29njeGVUcvx8RhfCYOp9UTejzwROCxSV4yyhqqagfw+8AW4O9oDVd4aJQ1dKnTegdSz8QE46q6u3ncC1xOq7t+T9N1TvO4d3wV9m6+baqqPc1B+xHgvQzvz2XDsAvY1fS+AHyY1gFvkvfTvNs04ftp1vOB66pqTzM/yftp1vdt0zLZT8tKkoNoheIPVNVHmuZOn72ubju9WM2f7D8BnDHiGk4BXphkJ/Ah4GeT/OWIa+j1vDqsfdHreWOYn4luj4nDqOE5wFeqal9VfQ/4CPATI66Bqrqwqp5ZVc+mNbTh1lHX0KbX9e5qpvuqZyKCcZLHJjlsdhp4Hq0u/yuBdc1i64ArxlNh7zpt0+yHoPEiWts5Earqa8CdSU5smk4DvsgE76dO2zTJ+6nNeXz/kIOJ3U9tvm+blsl+Wjaab4hfCOyoqne0PdXps3clcG6Sg5McT+uLQJ/ts4apJE9oph9DK5DcPMoaquqCqjq2qqZp/en+H6rqJaOsYRHn1YHXAIs6bwyljka3x8Rh1HAH8Kwkhzb/Tk6jNQZ/pL+HJEc1j6uBn6f1+xjHvph9/67X2wy3eCDJs5rf4S+xmPNYL9/UG9cPrTFINzQ/NwFvaNp/ENhK6380W4Ejxl3rALbpL4AvADc2O3/VuGvtcbvWANua+v8aOHyS99MC2zTp++lQ4B7gB9raJn0/zbdNE72fltsP8JO0/rR5I3B983PmQp894A20vnV+C4v4hvk8Nfwo8Pmmhu3AG5v2kdUwp55T+berUozy99DzeXVYv4dezxvDqKPXY+KQangLrf+kbW+OXQePoYZ/ovUfkxuA00b1e6AVwHcD36PV83v+YtYLzDS/v9uBP6G5kV0vP975TpIkSWJChlJIkiRJw2YwliRJkjAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkAP4/bBfLlO2xFAUAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "display_dataset(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "775\n"
     ]
    }
   ],
   "source": [
    "# Keep the case ids whose first total_time_true value lies in [250, 350] and whose\n",
    "# number of events lies in [50, 75].\n",
    "# The per-case statistics are computed once up front; the previous loop re-scanned\n",
    "# the whole frame with a boolean mask for every case id.\n",
    "case_col = \"case:concept:name\"\n",
    "first_true_value = df.drop_duplicates(subset=case_col).set_index(case_col)[\"total_time_true\"]\n",
    "events_per_case = df[case_col].value_counts()\n",
    "\n",
    "filtered_cids = []\n",
    "for cid in list(set(df[case_col].values)):  # same iteration order as before\n",
    "    true_value = first_true_value.at[cid]\n",
    "    trace_length = events_per_case.at[cid]\n",
    "    if 250 <= true_value <= 350 and 50 <= trace_length <= 75:\n",
    "        filtered_cids.append(cid)\n",
    "print(len(filtered_cids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200 50\n",
      "(14510, 49)\n"
     ]
    }
   ],
   "source": [
    "df_samples, train_cids, test_cids = sample_data(df, filtered_cids, 200, 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR:root:[Errno 2] No such file or directory: '/Users/wangnaixuan/Documents/GitHub/prediction_service/data/drl_data/bpic2018/bpic2018_sample250.csv': read sample file failed.\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the \"case 2\" cell further down imports this class from config as\n",
    "# DRLParameters. The fallback keeps this cell runnable if the long name is no longer\n",
    "# exported; confirm which name the current config module provides.\n",
    "try:\n",
    "    from config import DeepReinforceLearningParameters\n",
    "except ImportError:\n",
    "    from config import DRLParameters as DeepReinforceLearningParameters\n",
    "\n",
    "save_path = DeepReinforceLearningParameters(dataset_name=\"bpic2018\").DATA_PATH\n",
    "\n",
    "# Persist the sampled events together with the train/test case-id split.\n",
    "df_samples.to_csv(save_path+\"/bpic2018_sample250.csv\", index=False)\n",
    "np.save(save_path+\"/train_case_ids.npy\", np.array(train_cids))\n",
    "np.save(save_path+\"/test_case_ids.npy\", np.array(test_cids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### case 2: 读取case id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Reload the train/test case-id arrays from their .npy files.\n",
    "train_cids, test_cids = (\n",
    "    np.load(npy_path)\n",
    "    for npy_path in (\n",
    "        \"../data/drl_data/bpic2018/train_case_ids.npy\",\n",
    "        \"../data/drl_data/bpic2018/test_case_ids.npy\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(14510, 49)\n"
     ]
    }
   ],
   "source": [
    "# Keep only the events whose case id is in the loaded train/test split.\n",
    "# A vectorised isin() mask replaces the per-case DataFrame.append loop, which\n",
    "# re-copied the accumulated frame on every iteration and is unavailable in\n",
    "# pandas >= 2.0. Rows now keep their order of appearance in df.\n",
    "event_case_ids = df[\"case:concept:name\"]\n",
    "in_split = event_case_ids.isin(np.ravel(train_cids)) | event_case_ids.isin(np.ravel(test_cids))\n",
    "df_sample = df[in_split].copy()\n",
    "print(df_sample.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DRLParameters\n",
    "\n",
    "save_path = DRLParameters(dataset_name=\"bpic2018\").DATA_PATH\n",
    "\n",
    "df_sample.to_csv(save_path+\"/bpic2018_nn_sample250.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BPIC2019数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4b745138ad5d4357a8bc386a98c84a6b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=251734.0), HTML(v…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "data_path = \"../data/raw_data/BPIC2019/BPI_Challenge_2019.xes\"\n",
    "\n",
    "import pm4py\n",
    "\n",
    "log = pm4py.read_xes(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pm4py.convert_to_dataframe(log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User</th>\n",
       "      <th>org:resource</th>\n",
       "      <th>concept:name</th>\n",
       "      <th>Cumulative net worth (EUR)</th>\n",
       "      <th>time:timestamp</th>\n",
       "      <th>case:Spend area text</th>\n",
       "      <th>case:Company</th>\n",
       "      <th>case:Document Type</th>\n",
       "      <th>case:Sub spend area text</th>\n",
       "      <th>case:Purchasing Document</th>\n",
       "      <th>...</th>\n",
       "      <th>case:Vendor</th>\n",
       "      <th>case:Item Type</th>\n",
       "      <th>case:Item Category</th>\n",
       "      <th>case:Spend classification text</th>\n",
       "      <th>case:Source</th>\n",
       "      <th>case:Name</th>\n",
       "      <th>case:GR-Based Inv. Verif.</th>\n",
       "      <th>case:Item</th>\n",
       "      <th>case:concept:name</th>\n",
       "      <th>case:Goods Receipt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>batch_00</td>\n",
       "      <td>batch_00</td>\n",
       "      <td>SRM: Created</td>\n",
       "      <td>298.0</td>\n",
       "      <td>2018-01-02 12:53:00+00:00</td>\n",
       "      <td>CAPEX &amp; SOCS</td>\n",
       "      <td>companyID_0000</td>\n",
       "      <td>EC Purchase order</td>\n",
       "      <td>Facility Management</td>\n",
       "      <td>2000000000</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_0000</td>\n",
       "      <td>Standard</td>\n",
       "      <td>3-way match, invoice before GR</td>\n",
       "      <td>NPR</td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_0000</td>\n",
       "      <td>False</td>\n",
       "      <td>00001</td>\n",
       "      <td>2000000000_00001</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>batch_00</td>\n",
       "      <td>batch_00</td>\n",
       "      <td>SRM: Complete</td>\n",
       "      <td>298.0</td>\n",
       "      <td>2018-01-02 13:53:00+00:00</td>\n",
       "      <td>CAPEX &amp; SOCS</td>\n",
       "      <td>companyID_0000</td>\n",
       "      <td>EC Purchase order</td>\n",
       "      <td>Facility Management</td>\n",
       "      <td>2000000000</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_0000</td>\n",
       "      <td>Standard</td>\n",
       "      <td>3-way match, invoice before GR</td>\n",
       "      <td>NPR</td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_0000</td>\n",
       "      <td>False</td>\n",
       "      <td>00001</td>\n",
       "      <td>2000000000_00001</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>batch_00</td>\n",
       "      <td>batch_00</td>\n",
       "      <td>SRM: Awaiting Approval</td>\n",
       "      <td>298.0</td>\n",
       "      <td>2018-01-02 13:53:00+00:00</td>\n",
       "      <td>CAPEX &amp; SOCS</td>\n",
       "      <td>companyID_0000</td>\n",
       "      <td>EC Purchase order</td>\n",
       "      <td>Facility Management</td>\n",
       "      <td>2000000000</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_0000</td>\n",
       "      <td>Standard</td>\n",
       "      <td>3-way match, invoice before GR</td>\n",
       "      <td>NPR</td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_0000</td>\n",
       "      <td>False</td>\n",
       "      <td>00001</td>\n",
       "      <td>2000000000_00001</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>batch_00</td>\n",
       "      <td>batch_00</td>\n",
       "      <td>SRM: Document Completed</td>\n",
       "      <td>298.0</td>\n",
       "      <td>2018-01-02 13:53:00+00:00</td>\n",
       "      <td>CAPEX &amp; SOCS</td>\n",
       "      <td>companyID_0000</td>\n",
       "      <td>EC Purchase order</td>\n",
       "      <td>Facility Management</td>\n",
       "      <td>2000000000</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_0000</td>\n",
       "      <td>Standard</td>\n",
       "      <td>3-way match, invoice before GR</td>\n",
       "      <td>NPR</td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_0000</td>\n",
       "      <td>False</td>\n",
       "      <td>00001</td>\n",
       "      <td>2000000000_00001</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>batch_00</td>\n",
       "      <td>batch_00</td>\n",
       "      <td>SRM: In Transfer to Execution Syst.</td>\n",
       "      <td>298.0</td>\n",
       "      <td>2018-01-02 13:53:00+00:00</td>\n",
       "      <td>CAPEX &amp; SOCS</td>\n",
       "      <td>companyID_0000</td>\n",
       "      <td>EC Purchase order</td>\n",
       "      <td>Facility Management</td>\n",
       "      <td>2000000000</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_0000</td>\n",
       "      <td>Standard</td>\n",
       "      <td>3-way match, invoice before GR</td>\n",
       "      <td>NPR</td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_0000</td>\n",
       "      <td>False</td>\n",
       "      <td>00001</td>\n",
       "      <td>2000000000_00001</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1595918</th>\n",
       "      <td>user_603</td>\n",
       "      <td>user_603</td>\n",
       "      <td>Change Approval for Purchase Order</td>\n",
       "      <td>1385.0</td>\n",
       "      <td>2019-01-17 14:00:00+00:00</td>\n",
       "      <td></td>\n",
       "      <td>companyID_0003</td>\n",
       "      <td>Framework order</td>\n",
       "      <td></td>\n",
       "      <td>4508076348</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_1974</td>\n",
       "      <td>Limit</td>\n",
       "      <td>2-way match</td>\n",
       "      <td></td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_1898</td>\n",
       "      <td>False</td>\n",
       "      <td>00090</td>\n",
       "      <td>4508076348_00090</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1595919</th>\n",
       "      <td>user_602</td>\n",
       "      <td>user_602</td>\n",
       "      <td>Create Purchase Order Item</td>\n",
       "      <td>1385.0</td>\n",
       "      <td>2019-01-17 13:32:00+00:00</td>\n",
       "      <td></td>\n",
       "      <td>companyID_0003</td>\n",
       "      <td>Framework order</td>\n",
       "      <td></td>\n",
       "      <td>4508076348</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_1974</td>\n",
       "      <td>Limit</td>\n",
       "      <td>2-way match</td>\n",
       "      <td></td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_1898</td>\n",
       "      <td>False</td>\n",
       "      <td>00100</td>\n",
       "      <td>4508076348_00100</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1595920</th>\n",
       "      <td>user_603</td>\n",
       "      <td>user_603</td>\n",
       "      <td>Change Approval for Purchase Order</td>\n",
       "      <td>1385.0</td>\n",
       "      <td>2019-01-17 14:00:00+00:00</td>\n",
       "      <td></td>\n",
       "      <td>companyID_0003</td>\n",
       "      <td>Framework order</td>\n",
       "      <td></td>\n",
       "      <td>4508076348</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_1974</td>\n",
       "      <td>Limit</td>\n",
       "      <td>2-way match</td>\n",
       "      <td></td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_1898</td>\n",
       "      <td>False</td>\n",
       "      <td>00100</td>\n",
       "      <td>4508076348_00100</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1595921</th>\n",
       "      <td>user_602</td>\n",
       "      <td>user_602</td>\n",
       "      <td>Create Purchase Order Item</td>\n",
       "      <td>1385.0</td>\n",
       "      <td>2019-01-17 13:32:00+00:00</td>\n",
       "      <td></td>\n",
       "      <td>companyID_0003</td>\n",
       "      <td>Framework order</td>\n",
       "      <td></td>\n",
       "      <td>4508076348</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_1974</td>\n",
       "      <td>Limit</td>\n",
       "      <td>2-way match</td>\n",
       "      <td></td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_1898</td>\n",
       "      <td>False</td>\n",
       "      <td>00110</td>\n",
       "      <td>4508076348_00110</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1595922</th>\n",
       "      <td>user_603</td>\n",
       "      <td>user_603</td>\n",
       "      <td>Change Approval for Purchase Order</td>\n",
       "      <td>1385.0</td>\n",
       "      <td>2019-01-17 14:00:00+00:00</td>\n",
       "      <td></td>\n",
       "      <td>companyID_0003</td>\n",
       "      <td>Framework order</td>\n",
       "      <td></td>\n",
       "      <td>4508076348</td>\n",
       "      <td>...</td>\n",
       "      <td>vendorID_1974</td>\n",
       "      <td>Limit</td>\n",
       "      <td>2-way match</td>\n",
       "      <td></td>\n",
       "      <td>sourceSystemID_0000</td>\n",
       "      <td>vendor_1898</td>\n",
       "      <td>False</td>\n",
       "      <td>00110</td>\n",
       "      <td>4508076348_00110</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1595923 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             User org:resource                         concept:name  \\\n",
       "0        batch_00     batch_00                         SRM: Created   \n",
       "1        batch_00     batch_00                        SRM: Complete   \n",
       "2        batch_00     batch_00               SRM: Awaiting Approval   \n",
       "3        batch_00     batch_00              SRM: Document Completed   \n",
       "4        batch_00     batch_00  SRM: In Transfer to Execution Syst.   \n",
       "...           ...          ...                                  ...   \n",
       "1595918  user_603     user_603   Change Approval for Purchase Order   \n",
       "1595919  user_602     user_602           Create Purchase Order Item   \n",
       "1595920  user_603     user_603   Change Approval for Purchase Order   \n",
       "1595921  user_602     user_602           Create Purchase Order Item   \n",
       "1595922  user_603     user_603   Change Approval for Purchase Order   \n",
       "\n",
       "         Cumulative net worth (EUR)            time:timestamp  \\\n",
       "0                             298.0 2018-01-02 12:53:00+00:00   \n",
       "1                             298.0 2018-01-02 13:53:00+00:00   \n",
       "2                             298.0 2018-01-02 13:53:00+00:00   \n",
       "3                             298.0 2018-01-02 13:53:00+00:00   \n",
       "4                             298.0 2018-01-02 13:53:00+00:00   \n",
       "...                             ...                       ...   \n",
       "1595918                      1385.0 2019-01-17 14:00:00+00:00   \n",
       "1595919                      1385.0 2019-01-17 13:32:00+00:00   \n",
       "1595920                      1385.0 2019-01-17 14:00:00+00:00   \n",
       "1595921                      1385.0 2019-01-17 13:32:00+00:00   \n",
       "1595922                      1385.0 2019-01-17 14:00:00+00:00   \n",
       "\n",
       "        case:Spend area text    case:Company case:Document Type  \\\n",
       "0               CAPEX & SOCS  companyID_0000  EC Purchase order   \n",
       "1               CAPEX & SOCS  companyID_0000  EC Purchase order   \n",
       "2               CAPEX & SOCS  companyID_0000  EC Purchase order   \n",
       "3               CAPEX & SOCS  companyID_0000  EC Purchase order   \n",
       "4               CAPEX & SOCS  companyID_0000  EC Purchase order   \n",
       "...                      ...             ...                ...   \n",
       "1595918                       companyID_0003    Framework order   \n",
       "1595919                       companyID_0003    Framework order   \n",
       "1595920                       companyID_0003    Framework order   \n",
       "1595921                       companyID_0003    Framework order   \n",
       "1595922                       companyID_0003    Framework order   \n",
       "\n",
       "        case:Sub spend area text case:Purchasing Document  ...    case:Vendor  \\\n",
       "0            Facility Management               2000000000  ...  vendorID_0000   \n",
       "1            Facility Management               2000000000  ...  vendorID_0000   \n",
       "2            Facility Management               2000000000  ...  vendorID_0000   \n",
       "3            Facility Management               2000000000  ...  vendorID_0000   \n",
       "4            Facility Management               2000000000  ...  vendorID_0000   \n",
       "...                          ...                      ...  ...            ...   \n",
       "1595918                                        4508076348  ...  vendorID_1974   \n",
       "1595919                                        4508076348  ...  vendorID_1974   \n",
       "1595920                                        4508076348  ...  vendorID_1974   \n",
       "1595921                                        4508076348  ...  vendorID_1974   \n",
       "1595922                                        4508076348  ...  vendorID_1974   \n",
       "\n",
       "        case:Item Type              case:Item Category  \\\n",
       "0             Standard  3-way match, invoice before GR   \n",
       "1             Standard  3-way match, invoice before GR   \n",
       "2             Standard  3-way match, invoice before GR   \n",
       "3             Standard  3-way match, invoice before GR   \n",
       "4             Standard  3-way match, invoice before GR   \n",
       "...                ...                             ...   \n",
       "1595918          Limit                     2-way match   \n",
       "1595919          Limit                     2-way match   \n",
       "1595920          Limit                     2-way match   \n",
       "1595921          Limit                     2-way match   \n",
       "1595922          Limit                     2-way match   \n",
       "\n",
       "        case:Spend classification text          case:Source    case:Name  \\\n",
       "0                                  NPR  sourceSystemID_0000  vendor_0000   \n",
       "1                                  NPR  sourceSystemID_0000  vendor_0000   \n",
       "2                                  NPR  sourceSystemID_0000  vendor_0000   \n",
       "3                                  NPR  sourceSystemID_0000  vendor_0000   \n",
       "4                                  NPR  sourceSystemID_0000  vendor_0000   \n",
       "...                                ...                  ...          ...   \n",
       "1595918                                 sourceSystemID_0000  vendor_1898   \n",
       "1595919                                 sourceSystemID_0000  vendor_1898   \n",
       "1595920                                 sourceSystemID_0000  vendor_1898   \n",
       "1595921                                 sourceSystemID_0000  vendor_1898   \n",
       "1595922                                 sourceSystemID_0000  vendor_1898   \n",
       "\n",
       "        case:GR-Based Inv. Verif.  case:Item case:concept:name  \\\n",
       "0                           False      00001  2000000000_00001   \n",
       "1                           False      00001  2000000000_00001   \n",
       "2                           False      00001  2000000000_00001   \n",
       "3                           False      00001  2000000000_00001   \n",
       "4                           False      00001  2000000000_00001   \n",
       "...                           ...        ...               ...   \n",
       "1595918                     False      00090  4508076348_00090   \n",
       "1595919                     False      00100  4508076348_00100   \n",
       "1595920                     False      00100  4508076348_00100   \n",
       "1595921                     False      00110  4508076348_00110   \n",
       "1595922                     False      00110  4508076348_00110   \n",
       "\n",
       "        case:Goods Receipt  \n",
       "0                     True  \n",
       "1                     True  \n",
       "2                     True  \n",
       "3                     True  \n",
       "4                     True  \n",
       "...                    ...  \n",
       "1595918              False  \n",
       "1595919              False  \n",
       "1595920              False  \n",
       "1595921              False  \n",
       "1595922              False  \n",
       "\n",
       "[1595923 rows x 21 columns]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rich-display the event-log frame `df`; pandas abbreviates a long frame to its first and last rows.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 18\n",
      "2000 31\n",
      "3000 44\n",
      "4000 54\n",
      "5000 61\n",
      "6000 71\n",
      "7000 82\n",
      "8000 93\n",
      "9000 107\n",
      "10000 119\n",
      "11000 137\n",
      "12000 151\n",
      "13000 161\n",
      "14000 175\n",
      "15000 190\n",
      "16000 204\n",
      "17000 216\n",
      "18000 234\n",
      "19000 249\n",
      "20000 261\n",
      "21000 274\n",
      "22000 288\n",
      "23000 305\n",
      "24000 319\n",
      "25000 332\n",
      "26000 342\n",
      "27000 356\n",
      "28000 372\n",
      "29000 384\n",
      "30000 398\n",
      "31000 408\n",
      "32000 420\n",
      "33000 431\n",
      "34000 450\n",
      "35000 468\n",
      "36000 477\n",
      "37000 485\n",
      "38000 500\n",
      "39000 507\n",
      "40000 516\n",
      "41000 526\n",
      "42000 538\n",
      "43000 554\n",
      "44000 564\n",
      "45000 581\n",
      "46000 601\n",
      "47000 610\n",
      "48000 622\n",
      "49000 636\n",
      "50000 644\n",
      "51000 651\n",
      "52000 663\n",
      "53000 670\n",
      "54000 675\n",
      "55000 685\n",
      "56000 693\n",
      "57000 713\n",
      "58000 723\n",
      "59000 737\n",
      "60000 747\n",
      "61000 757\n",
      "62000 768\n",
      "63000 783\n",
      "64000 799\n",
      "65000 814\n",
      "66000 826\n",
      "67000 840\n",
      "68000 846\n",
      "69000 856\n",
      "70000 869\n",
      "71000 880\n",
      "72000 892\n",
      "73000 904\n",
      "74000 918\n",
      "75000 928\n",
      "76000 943\n",
      "77000 953\n",
      "78000 963\n",
      "79000 974\n",
      "80000 983\n",
      "81000 999\n",
      "82000 1020\n",
      "83000 1030\n",
      "84000 1043\n",
      "85000 1053\n",
      "86000 1069\n",
      "87000 1082\n",
      "88000 1093\n",
      "89000 1104\n",
      "90000 1120\n",
      "91000 1132\n",
      "92000 1149\n",
      "93000 1162\n",
      "94000 1171\n",
      "95000 1184\n",
      "96000 1196\n",
      "97000 1208\n",
      "98000 1220\n",
      "99000 1227\n",
      "100000 1240\n",
      "101000 1248\n",
      "102000 1259\n",
      "103000 1264\n",
      "104000 1270\n",
      "105000 1285\n",
      "106000 1296\n",
      "107000 1317\n",
      "108000 1333\n",
      "109000 1341\n",
      "110000 1348\n",
      "111000 1360\n",
      "112000 1372\n",
      "113000 1384\n",
      "114000 1396\n",
      "115000 1404\n",
      "116000 1415\n",
      "117000 1425\n",
      "118000 1435\n",
      "119000 1448\n",
      "120000 1459\n",
      "121000 1472\n",
      "122000 1480\n",
      "123000 1492\n",
      "124000 1505\n",
      "125000 1516\n",
      "126000 1525\n",
      "127000 1546\n",
      "128000 1553\n",
      "129000 1565\n",
      "130000 1573\n",
      "131000 1591\n",
      "132000 1600\n",
      "133000 1611\n",
      "134000 1621\n",
      "135000 1634\n",
      "136000 1650\n",
      "137000 1658\n",
      "138000 1668\n",
      "139000 1678\n",
      "140000 1689\n",
      "141000 1700\n",
      "142000 1706\n",
      "143000 1719\n",
      "144000 1730\n",
      "145000 1745\n",
      "146000 1759\n",
      "147000 1773\n",
      "148000 1783\n",
      "149000 1798\n",
      "150000 1816\n",
      "151000 1826\n",
      "152000 1836\n",
      "153000 1845\n",
      "154000 1855\n",
      "155000 1875\n",
      "156000 1888\n",
      "157000 1903\n",
      "158000 1913\n",
      "159000 1922\n",
      "160000 1933\n",
      "161000 1945\n",
      "162000 1964\n",
      "163000 1976\n",
      "164000 1987\n",
      "165000 1998\n",
      "166000 2013\n",
      "167000 2024\n",
      "168000 2034\n",
      "169000 2047\n",
      "170000 2062\n",
      "171000 2066\n",
      "172000 2076\n",
      "173000 2082\n",
      "174000 2096\n",
      "175000 2108\n",
      "176000 2122\n",
      "177000 2135\n",
      "178000 2148\n",
      "179000 2165\n",
      "180000 2170\n",
      "181000 2179\n",
      "182000 2188\n",
      "183000 2206\n",
      "184000 2219\n",
      "185000 2231\n",
      "186000 2248\n",
      "187000 2260\n",
      "188000 2271\n",
      "189000 2286\n",
      "190000 2292\n",
      "191000 2304\n",
      "192000 2317\n",
      "193000 2321\n",
      "194000 2333\n",
      "195000 2349\n",
      "196000 2361\n",
      "197000 2371\n",
      "198000 2385\n",
      "199000 2400\n",
      "200000 2409\n",
      "201000 2419\n",
      "202000 2433\n",
      "203000 2443\n",
      "204000 2455\n",
      "205000 2471\n",
      "206000 2482\n",
      "207000 2497\n",
      "208000 2511\n",
      "209000 2519\n",
      "210000 2530\n",
      "211000 2539\n",
      "212000 2550\n",
      "213000 2562\n",
      "214000 2570\n",
      "215000 2585\n",
      "216000 2598\n",
      "217000 2601\n",
      "218000 2616\n",
      "219000 2628\n",
      "220000 2646\n",
      "221000 2666\n",
      "222000 2676\n",
      "223000 2685\n",
      "224000 2701\n",
      "225000 2708\n",
      "226000 2716\n",
      "227000 2731\n",
      "228000 2742\n",
      "229000 2754\n",
      "230000 2766\n",
      "231000 2783\n",
      "232000 2794\n",
      "233000 2807\n",
      "234000 2820\n",
      "235000 2824\n",
      "236000 2835\n",
      "237000 2843\n",
      "238000 2863\n",
      "239000 2879\n",
      "240000 2891\n",
      "241000 2907\n",
      "242000 2916\n",
      "243000 2927\n",
      "244000 2934\n",
      "245000 2949\n",
      "246000 2960\n",
      "247000 2967\n",
      "248000 2981\n",
      "249000 2990\n",
      "250000 3005\n",
      "251000 3019\n"
     ]
    }
   ],
   "source": [
    "# Select the cases whose trace length lies in [MIN_TRACE_LEN, MAX_TRACE_LEN].\n",
    "# The events of every case are counted in a single pass; filtering the whole\n",
    "# frame once per case id rescans all rows for each distinct case.\n",
    "MIN_TRACE_LEN, MAX_TRACE_LEN = 15, 50  # inclusive bounds on events per case\n",
    "\n",
    "case_sizes = df[\"case:concept:name\"].value_counts(sort=False)\n",
    "cid_cnt = len(case_sizes)  # number of distinct cases inspected\n",
    "valid_cnts = case_sizes[(case_sizes >= MIN_TRACE_LEN) & (case_sizes <= MAX_TRACE_LEN)].index.tolist()\n",
    "print(cid_cnt, len(valid_cnts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 4507018870_00010\n",
      "100 4507025387_00140\n",
      "200 4508073952_00001\n",
      "300 4507027248_00001\n",
      "400 4508061004_00001\n",
      "500 4508054468_00001\n",
      "600 4507037431_00001\n",
      "700 4507076129_00170\n",
      "800 4507027486_00001\n",
      "900 4508060674_00001\n",
      "1000 4508054833_00070\n",
      "1100 4507014548_00001\n",
      "1200 4507002187_00070\n",
      "1300 4507024263_00001\n",
      "1400 4507021930_00130\n",
      "1500 4507076145_00010\n",
      "1600 4507005373_00001\n",
      "1700 4507017351_00020\n",
      "1800 4508050666_00001\n",
      "1900 4507026043_00001\n",
      "2000 4508073285_00010\n",
      "2100 4507023412_00001\n",
      "2200 4507042048_00001\n",
      "2300 4508055407_00001\n",
      "2400 4508071372_00001\n",
      "2500 4508066517_00001\n",
      "2600 4507019734_00020\n",
      "2700 4508058238_00001\n",
      "2800 2000013457_00001\n",
      "2900 4508055192_00001\n",
      "3000 4507003051_00001\n",
      "(77288, 21)\n"
     ]
    }
   ],
   "source": [
    "# Collect the events of the selected cases, keeping the cases in `valid_cnts`\n",
    "# order and the events of each case in their original order.\n",
    "# Group once and concatenate once: calling pd.concat inside the loop re-copies\n",
    "# the growing frame on every iteration.\n",
    "case_groups = df.groupby(\"case:concept:name\", sort=False)\n",
    "selected_traces = [case_groups.get_group(cid) for cid in valid_cnts]\n",
    "# pd.concat raises on an empty list, so fall back to an empty frame when no case qualifies.\n",
    "df_filtered = pd.concat(selected_traces) if selected_traces else pd.DataFrame()\n",
    "print(df_filtered.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project the filtered log onto the columns that are exported, then write them to CSV without the index.\n",
    "kept_columns = [\n",
    "    \"concept:name\",\n",
    "    \"Cumulative net worth (EUR)\",\n",
    "    \"time:timestamp\",\n",
    "    \"case:Company\",\n",
    "    \"case:Document Type\",\n",
    "    \"case:Item Type\",\n",
    "    \"case:Item Category\",\n",
    "    \"case:GR-Based Inv. Verif.\",\n",
    "    \"case:Item\",\n",
    "    \"case:concept:name\",\n",
    "    \"case:Goods Receipt\",\n",
    "]\n",
    "df_f = df_filtered[kept_columns]\n",
    "df_f.to_csv(\"../data/filtered_data/BPIC2019/filtered_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "1000\n",
      "2000\n",
      "3000\n",
      "4000\n",
      "5000\n",
      "6000\n",
      "7000\n",
      "8000\n",
      "9000\n",
      "10000\n",
      "11000\n",
      "12000\n",
      "13000\n",
      "14000\n",
      "15000\n",
      "16000\n",
      "17000\n",
      "18000\n",
      "19000\n",
      "20000\n",
      "21000\n",
      "22000\n",
      "23000\n",
      "24000\n",
      "25000\n",
      "26000\n",
      "27000\n",
      "28000\n",
      "29000\n",
      "30000\n",
      "31000\n",
      "32000\n",
      "33000\n",
      "34000\n",
      "35000\n",
      "36000\n",
      "37000\n",
      "38000\n",
      "39000\n",
      "40000\n",
      "41000\n",
      "42000\n",
      "43000\n",
      "44000\n",
      "45000\n",
      "46000\n",
      "47000\n",
      "48000\n",
      "49000\n",
      "50000\n",
      "51000\n",
      "52000\n",
      "53000\n",
      "54000\n",
      "55000\n",
      "56000\n",
      "57000\n",
      "58000\n",
      "59000\n",
      "60000\n",
      "61000\n",
      "62000\n",
      "63000\n",
      "64000\n",
      "65000\n",
      "66000\n",
      "67000\n",
      "68000\n",
      "69000\n",
      "70000\n",
      "71000\n",
      "72000\n",
      "73000\n",
      "74000\n",
      "75000\n",
      "76000\n",
      "77000\n",
      "(77288, 11)\n"
     ]
    }
   ],
   "source": [
    "from data_cleaner.data_preprocess import BPIC2019Preprocess\n",
    "\n",
    "# Rebind `df` to the frame returned by the project's BPIC2019 preprocessing step and report its shape.\n",
    "# NOTE(review): presumably this reads the filtered CSV exported above - confirm in data_cleaner.data_preprocess.\n",
    "bpic2019_preprocessor = BPIC2019Preprocess()\n",
    "df = bpic2019_preprocessor.preprocess()\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data_size:  2116 , test_data_size:  907\n"
     ]
    }
   ],
   "source": [
    "def prepare_train_data(data_processed, max_len=20):\n",
    "    \"\"\"Turn the preprocessed event log into padded prefix sequences.\n",
    "\n",
    "    Case ids are split 70/30 into a training and a test set with a fixed seed.\n",
    "    For every case, each prefix that ends before the last event becomes one\n",
    "    sample. A sample row is laid out as\n",
    "    ``[case id, one-hot features..., numeric features..., remaining days]``.\n",
    "    All columns except the first (case id) and the last (remaining days) are\n",
    "    min-max scaled with statistics taken from the training samples only.\n",
    "\n",
    "    Args:\n",
    "        data_processed: DataFrame providing the columns read below\n",
    "            ('case:concept:name', 'concept:name', 'time:timestamp_short',\n",
    "            'case:Company', 'case:Document Type', 'case:Item Type',\n",
    "            'case:Item Category', 'case:GR-Based Inv. Verif.', 'case:Item',\n",
    "            'case:Goods Receipt', 'Cumulative net worth (EUR)').\n",
    "        max_len: ``maxlen`` handed to ``sequence.pad_sequences``; the default\n",
    "            keeps the previously hard-coded value of 20.\n",
    "\n",
    "    Returns:\n",
    "        ``(train_data_set, test_data_set)``: float64 arrays returned by\n",
    "        ``sequence.pad_sequences``.\n",
    "    \"\"\"\n",
    "    case_col = 'case:concept:name'\n",
    "    time_col = 'time:timestamp_short'\n",
    "    time_fmt = '%Y-%m-%d %H:%M:%S'\n",
    "\n",
    "    all_case_ids = set(data_processed[case_col].values)\n",
    "    num_trainset = len(all_case_ids) * 7 // 10\n",
    "    random.seed(0)\n",
    "    # Sample from a sorted list: random.sample() rejects a set (TypeError since\n",
    "    # Python 3.11), and a fixed population order keeps the seeded split reproducible.\n",
    "    train_cids = random.sample(sorted(all_case_ids), num_trainset)\n",
    "    train_cids_set = set(train_cids)\n",
    "    test_cids_set = all_case_ids - train_cids_set\n",
    "    print('train_data_size: ', len(train_cids_set), ', test_data_size: ', len(test_cids_set))\n",
    "\n",
    "    def fit_one_hot(values):\n",
    "        \"\"\"Fit a dense one-hot encoder on a 1-D collection of category values.\"\"\"\n",
    "        try:\n",
    "            encoder = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2\n",
    "        except TypeError:\n",
    "            encoder = OneHotEncoder(sparse=False)  # older releases only know `sparse`\n",
    "        encoder.fit(np.reshape(np.array(values), (-1, 1)))\n",
    "        return encoder\n",
    "\n",
    "    ohe_act = fit_one_hot(data_processed['concept:name'].values)\n",
    "    ohe_cn = fit_one_hot(data_processed['case:Company'].values)\n",
    "    ohe_dt = fit_one_hot(data_processed['case:Document Type'].values)\n",
    "    ohe_it = fit_one_hot(data_processed['case:Item Type'].values)\n",
    "    ohe_ic = fit_one_hot(data_processed['case:Item Category'].values)\n",
    "    ohe_bs = fit_one_hot(data_processed['case:GR-Based Inv. Verif.'].values)\n",
    "    ohe_is = fit_one_hot(data_processed['case:Item'].values)\n",
    "    ohe_week = fit_one_hot(list(range(0, 7)))  # datetime.weekday() yields 0..6\n",
    "\n",
    "    # (column, encoder) pairs in the order their one-hot blocks appear in a row.\n",
    "    # NOTE(review): 'case:Goods Receipt' reuses the encoder fitted on\n",
    "    # 'case:GR-Based Inv. Verif.'; transform raises if it holds a value that\n",
    "    # column never contains.\n",
    "    encoded_columns = [\n",
    "        ('concept:name', ohe_act),  # activity name\n",
    "        ('case:Company', ohe_cn),\n",
    "        ('case:Document Type', ohe_dt),\n",
    "        ('case:Item Type', ohe_it),\n",
    "        ('case:Item Category', ohe_ic),\n",
    "        ('case:GR-Based Inv. Verif.', ohe_bs),\n",
    "        ('case:Item', ohe_is),\n",
    "        ('case:Goods Receipt', ohe_bs),\n",
    "    ]\n",
    "\n",
    "    def generate_data(cid_set):\n",
    "        \"\"\"Return the prefix samples of the given cases and the longest trace length seen.\"\"\"\n",
    "        data_set = []\n",
    "        trace_length = 0\n",
    "        for cid in cid_set:\n",
    "            thisdf = data_processed[data_processed[case_col] == cid]\n",
    "            trace_length = max(trace_length, thisdf.shape[0])\n",
    "\n",
    "            # Parse every timestamp of the trace once.\n",
    "            event_times = [datetime.datetime.strptime(stamp, time_fmt) for stamp in thisdf[time_col]]\n",
    "            start_time = event_times[0]\n",
    "            end_time = event_times[-1]\n",
    "            last_time = start_time\n",
    "            last_index = len(event_times) - 1\n",
    "\n",
    "            tmpdata = []\n",
    "            for i, event_dt in enumerate(event_times):\n",
    "                event = thisdf.iloc[i]\n",
    "                row = [int(event[case_col])]\n",
    "\n",
    "                for column, encoder in encoded_columns:\n",
    "                    value = np.reshape(np.array(event[column]), (-1, 1))\n",
    "                    row.extend(encoder.transform(value).tolist()[0])\n",
    "                weekday = np.reshape(np.array([event_dt.weekday()]), (-1, 1))\n",
    "                row.extend(ohe_week.transform(weekday).tolist()[0])\n",
    "\n",
    "                row.append(event['Cumulative net worth (EUR)'])\n",
    "\n",
    "                midnight_time = event_dt.replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "                row.append((event_dt - start_time).total_seconds()/3600/24)  # days since the first event of the case\n",
    "                row.append((event_dt - last_time).total_seconds()/3600/24)  # days since the previous event\n",
    "                last_time = event_dt\n",
    "                row.append((event_dt - midnight_time).total_seconds()/3600/24)  # fraction of the day elapsed since midnight\n",
    "\n",
    "                row.append((end_time - event_dt).total_seconds()/3600/24)  # last column: days until the final event\n",
    "\n",
    "                tmpdata.append(row)\n",
    "                # The complete trace is not emitted as a sample.\n",
    "                if i != last_index:\n",
    "                    data_set.append(tmpdata.copy())\n",
    "        return data_set, trace_length\n",
    "\n",
    "    train_data_set, _ = generate_data(train_cids_set)\n",
    "    test_data_set, _ = generate_data(test_cids_set)\n",
    "\n",
    "    # Per-column extrema over the training samples. Index 0 (case id) is never\n",
    "    # touched and the last column (remaining days) is outside the lists.\n",
    "    # +/-inf starts make the maximum correct even for columns whose values are all below -1.\n",
    "    num_columns = len(train_data_set[0][0])\n",
    "    min_value = [float('inf')] * (num_columns - 1)\n",
    "    max_value = [float('-inf')] * (num_columns - 1)\n",
    "    for element in train_data_set:\n",
    "        for row in element:\n",
    "            for i in range(1, len(row) - 1):\n",
    "                min_value[i] = min(min_value[i], row[i])\n",
    "                max_value[i] = max(max_value[i], row[i])\n",
    "\n",
    "    def scale(data_set):\n",
    "        \"\"\"Min-max scale the feature columns of every sample with the training extrema.\"\"\"\n",
    "        scaled_set = []\n",
    "        for element in data_set:\n",
    "            seq = []\n",
    "            for raw in element:\n",
    "                row = [raw[0]]\n",
    "                for k in range(1, len(raw) - 1):\n",
    "                    if max_value[k] == min_value[k]:\n",
    "                        # Constant column in the training data: keep the raw value.\n",
    "                        row.append(raw[k])\n",
    "                    else:\n",
    "                        row.append((raw[k] - min_value[k]) / (max_value[k] - min_value[k]))\n",
    "                row.append(raw[-1])\n",
    "                seq.append(row)\n",
    "            scaled_set.append(seq)\n",
    "        return scaled_set\n",
    "\n",
    "    train_data_set = sequence.pad_sequences(scale(train_data_set), maxlen=max_len, dtype='float64')\n",
    "    test_data_set = sequence.pad_sequences(scale(test_data_set), maxlen=max_len, dtype='float64')\n",
    "    return train_data_set, test_data_set\n",
    "\n",
    "train_data_set, test_data_set = prepare_train_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the padded train/test arrays produced by prepare_train_data.\n",
    "filtered_data_outputs = (\n",
    "    (\"../data/filtered_data/BPIC2019/train_data.npy\", train_data_set),\n",
    "    (\"../data/filtered_data/BPIC2019/test_data.npy\", test_data_set),\n",
    ")\n",
    "for npy_path, split_array in filtered_data_outputs:\n",
    "    np.save(npy_path, split_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 concept:name  Cumulative net worth (EUR)  \\\n",
      "0  Create Purchase Order Item                      5049.0   \n",
      "1             Change Quantity                      5049.0   \n",
      "2      Vendor creates invoice                      5049.0   \n",
      "3        Record Goods Receipt                      5049.0   \n",
      "4             Change Quantity                      5049.0   \n",
      "\n",
      "  time:timestamp_short    case:Company case:Document Type case:Item Type  \\\n",
      "0  2018-03-28 12:26:00  companyID_0000        Standard PO       Standard   \n",
      "1  2018-03-29 07:16:00  companyID_0000        Standard PO       Standard   \n",
      "2  2018-05-29 21:59:00  companyID_0000        Standard PO       Standard   \n",
      "3  2018-05-30 08:34:00  companyID_0000        Standard PO       Standard   \n",
      "4  2018-05-30 10:06:00  companyID_0000        Standard PO       Standard   \n",
      "\n",
      "               case:Item Category  case:GR-Based Inv. Verif.  case:Item  \\\n",
      "0  3-way match, invoice before GR                      False         30   \n",
      "1  3-way match, invoice before GR                      False         30   \n",
      "2  3-way match, invoice before GR                      False         30   \n",
      "3  3-way match, invoice before GR                      False         30   \n",
      "4  3-way match, invoice before GR                      False         30   \n",
      "\n",
      "   case:concept:name  case:Goods Receipt  time_spent  total_time_pred  \\\n",
      "0         2065874945                True    0.000000       100.277107   \n",
      "1         2065874945                True    0.784722        85.436990   \n",
      "2         2065874945                True   62.397917       154.795370   \n",
      "3         2065874945                True   62.838889        90.531699   \n",
      "4         2065874945                True   62.902778       120.635104   \n",
      "\n",
      "   total_time_true  \n",
      "0       134.002083  \n",
      "1       134.002083  \n",
      "2       134.002083  \n",
      "3       134.002083  \n",
      "4       134.002083  \n"
     ]
    }
   ],
   "source": [
    "# Per-event BPIC2019 log; judging by the printed head it carries a predicted\n",
    "# total time (total_time_pred) next to the ground truth (total_time_true).\n",
    "data_path = \"../data/drl_data/bpic2019/bpic2019_wp_nn.csv\"\n",
    "\n",
    "df = pd.read_csv(filepath_or_buffer=data_path)\n",
    "# head() returns the first 5 rows by default.\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Case 1: 抽样 case id（筛选 case 后抽取训练/测试 case id 并保存）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3023 3023\n",
      "15 50 0.0 6536.500694444445\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAF1CAYAAADr3izzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAjlklEQVR4nO3dfbRddX3n8ffHgKCiFYYLjUnGoE0dwTWCzcpg6WoZsYWCNbjW0BU72syUKa4OTrWjYxPtqtrVdLD1oe2aYicqNVqVZnwYUtFWTGVZpxYMCEhASiwpuRDJ9RmdTirhO3+cX/T0em9y7r3n6ea+X2uddfb57YfzPfue/M4nv7PP3qkqJEmSpKXuMaMuQJIkSRoHBmNJkiQJg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAY6xiRZG+S54/geVcnqSTHDfu5JUkzS/LuJL896jq0+BiMNW+jCqOjtBRfsySB/Z+WBoOxBsZRVElaOuzzdSwwGGtekrwX+JfAnyf5dpLXdB1WcHmS+4G/asv+ryRfTvLNJJ9OclbXdh6X5C1J/qHN/0ySx7V55yb5myTfSHJ7kvN7rO0xSTYl+VKSrybZnuSUNu9wjRuT3J/kK0leN62ebUm+nuTu9romZ3vNXU/772faniQdC3rt85Ocf7jP7Fr3eyPNR+qfZ3jOu5O8oOvxca2PfU57POtny7Tt/Ickn5nWVkl+pE2fkOTNrQ9/KMkfd30OnZrko+1z6GtJ/jqJ2ekY5h9X81JVLwXuB36uqk6qqt/tmv1TwDOBC9vjjwNrgNOAW4H3dS37ZuDHgB8HTgFeAzyaZAVwPfDbrf3VwIeSTPRQ3q8Cl7Y6ngJ8Hfijacv8BPAM4ALgN5M8s7W/HlgNPA34aeAlPb7m2bYnSYveHPv8I+mlfz7sA8CLux5fCHylqm5tj4/02TIXbwJ+FDgb+BFgBfCbbd6rgElgAjgdeC1Q83weLQIGYw3CG6rqO1X1jwBVdU1VPVxVB4E3AM9O8kPtf92/BLyiqh6oqkNV9TdtuZcAH6uqj1XVo1V1A7ALuLiH538Z8Lqqmux6zn837Wu+N1bVP1bV7cDtwLNb+88Dv1NVX6+qSeAPe3zNs21Pko51/6zPP4pe+ufD3g+8MMnj2+NfaG3A7J8tcyk8SYBfBn6tqr5WVQ8DvwNsaIt8F1gOPLWqvltVf11VBuNjmMcDaRD2HZ5IsgzYAlxG53/cj7ZZpwInACcCX5phG08FLkvyc11txwOf6uH5nwp8JMmjXW2H6Pxv/7Avd03/X+CkNv2U7vqnTR/JbNuTpGNdr/0kHLl/fqB7warak+Ru4OeS/DnwQuAcOOpnyzfnUM8E8Hjglk5GBiDAsjb9e3RC9yfa/K1VddUctq9FxmCshZjtf83d7b8ArAeeD+wFfojOV2cBvgL8P+DpdEZZu+0D3ltVvzyPuvYBv1RV/2f6jCSrj7LufmAlcFd7vGrafEcKJC1VvfT536ETNIHvBdjuQ+Bm7Z9ncfhwiscAd1XVntZ+pM+W6abX9MNd874C/CNwVlU9MH3FNoL8KuBV7RjmTyX5XFXt7LF+LTIeSqGFeIjOsbhH8kTgIPBVOh3T7xyeUVWPAtcAb03ylCTLkjw3yQnAn9IZJbiwtZ/YftSxsoe6/hjYkuSpAEkmkqzv8TVtBzYnObkd5/zyafN7ec2SdCzqpf/7O+DEJJckOR74DTrfDh421/75WuBngF+h6zAKjvDZMoPbgbOSnJ3kRDojwMD3PofeAbwtyWmtphVJLmzTL0jyI+2Qi2/RGd0+dJR9oEXMYKyF+O/Ab7Rf6756lmXeA/wDna/I7gL+dtr8VwNfAD4HfI3OjyAeU1X76IwGvBaYojPK8N/o7T37B8AOOl99Pdye89/0+Jp+i84PLe4DPgl8kE7ne1gvr1mSjkVH7f+q6pvAfwbeSaff/w6dPvWwOfXPVbUf+CydH2j/Wdes
o322dG/j7+j07Z8E7gU+M22RXwf2AH+b5FttuWe0eWva42+3Oq6uqhtney4tfvEYcml2SX4F2FBVPzXqWiRJ0mA5Yix1SbI8yXntXJvPoHNs2UdGXZckSRo8f3wn/XOPBf4ncAbwDTrHt109yoIkSdJweCiFJEmShIdSSJIkSYDBWJIkSQLG5BjjU089tVavXj3qMiQtcbfccstXqmri6EtqPuzrJY2DI/X1YxGMV69eza5du0ZdhqQlLsk/jLqGY5l9vaRxcKS+3kMpJEmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiQAjht1AQuxetP1s87be9UlQ6xEkiRJi50jxpIkSRIGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJKCHYJzkxCQ3J7k9ye4kb2ztb0jyQJLb2u3irnU2J9mT5J4kFw7yBUiSJEn90MuV7w4Cz6uqbyc5HvhMko+3eW+rqjd3L5zkTGADcBbwFOCTSX60qg71s3BJkiSpn446Ylwd324Pj2+3OsIq64Frq+pgVd0H7AHWLbhSSZIkaYB6OsY4ybIktwEHgBuq6qY26+VJ7khyTZKTW9sKYF/X6pOtTZIkSRpbPQXjqjpUVWcDK4F1SZ4FvB14OnA2sB94S1s8M21iekOSK5LsSrJrampqHqVLkiRJ/TOns1JU1TeAG4GLquqhFpgfBd7B9w+XmARWda22Enhwhm1traq1VbV2YmJiPrVLkiRJfdPLWSkmkjy5TT8OeD7wxSTLuxZ7EXBnm94BbEhyQpIzgDXAzX2tWpIkSeqzXs5KsRzYlmQZnSC9vao+muS9Sc6mc5jEXuBlAFW1O8l24C7gEeBKz0ghSZKkcXfUYFxVdwDnzND+0iOsswXYsrDSJEmSpOHxyneSJEkSBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVpyUhyTZIDSe7sajslyQ1J7m33J3fN25xkT5J7klzY1f5jSb7Q5v1hkgz7tUjSIBiMJWnpeDdw0bS2TcDOqloD7GyPSXImsAE4q61zdbsCKsDbgSuANe02fZuStCgZjCVpiaiqTwNfm9a8HtjWprcBl3a1X1tVB6vqPmAPsC7JcuBJVfXZqirgPV3rSNKiZjCWpKXt9KraD9DuT2vtK4B9XctNtrYVbXp6+4ySXJFkV5JdU1NTfS1ckvrNYCxJmslMxw3XEdpnVFVbq2ptVa2dmJjoW3GSNAgGY0la2h5qh0fQ7g+09klgVddyK4EHW/vKGdoladEzGEvS0rYD2NimNwLXdbVvSHJCkjPo/Mju5na4xcNJzm1no/jFrnUkaVE7btQFSJKGI8kHgPOBU5NMAq8HrgK2J7kcuB+4DKCqdifZDtwFPAJcWVWH2qZ+hc4ZLh4HfLzdJGnRMxhL0hJRVS+eZdYFsyy/BdgyQ/su4Fl9LE2SxoKHUkiSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiSgh2Cc5MQkNye5PcnuJG9s7ackuSHJve3+5K51NifZk+SeJBcO8gVIkiRJ/dDLiPFB4HlV9WzgbOCiJOcCm4CdVbUG2Nkek+RMYANwFnARcHWSZQOoXZIkSeqbowbj6vh2e3h8uxWwHtjW2rcBl7bp9cC1VXWwqu4D9gDr+lm0JEmS1G89HWOcZFmS24ADwA1VdRNwelXtB2j3p7XFVwD7ulafbG3Tt3lFkl1Jdk1NTS3gJUiSJEkL11MwrqpDVXU2sBJYl+RZR1g8M21ihm1uraq1VbV2YmKip2IlSZKk
QZnTWSmq6hvAjXSOHX4oyXKAdn+gLTYJrOpabSXw4EILlSRJkgapl7NSTCR5cpt+HPB84IvADmBjW2wjcF2b3gFsSHJCkjOANcDNfa5bkiRJ6qvjelhmObCtnVniMcD2qvpoks8C25NcDtwPXAZQVbuTbAfuAh4BrqyqQ4MpX5IkSeqPowbjqroDOGeG9q8CF8yyzhZgy4KrkyRJkobEK99JkiRJGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5KAJL+WZHeSO5N8IMmJSU5JckOSe9v9yV3Lb06yJ8k9SS4cZe2S1C8GY0la4pKsAH4VWFtVzwKWARuATcDOqloD7GyPSXJmm38WcBFwdZJlo6hdkvrpuFEXMCirN10/67y9V10yxEokaVE4Dnhcku8CjwceBDYD57f524AbgV8H1gPXVtVB4L4ke4B1wGeHXLMk9ZUjxpK0xFXVA8CbgfuB/cA3q+oTwOlVtb8tsx84ra2yAtjXtYnJ1iZJi5rBWJKWuHbs8HrgDOApwBOSvORIq8zQVrNs+4oku5LsmpqaWnixkjRABmNJ0vOB+6pqqqq+C3wY+HHgoSTLAdr9gbb8JLCqa/2VdA69+AFVtbWq1lbV2omJiYG9AEnqB4OxJOl+4Nwkj08S4ALgbmAHsLEtsxG4rk3vADYkOSHJGcAa4OYh1yxJfXfM/vhOktSbqropyQeBW4FHgM8DW4GTgO1JLqcTni9ry+9Osh24qy1/ZVUdGknxktRHBmNJElX1euD105oP0hk9nmn5LcCWQdclScPkoRSSJEkSBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEtBDME6yKsmnktydZHeSV7T2NyR5IMlt7XZx1zqbk+xJck+SCwf5AiRJkqR+6OV0bY8Ar6qqW5M8EbglyQ1t3tuq6s3dCyc5E9gAnEXn0qKfTPKjnuNSkiRJ4+yoI8ZVtb+qbm3TD9O5GtKKI6yyHri2qg5W1X3AHmBdP4qVJEmSBmVOxxgnWQ2cA9zUml6e5I4k1yQ5ubWtAPZ1rTbJDEE6yRVJdiXZNTU1NffKJUmSpD7qORgnOQn4EPDKqvoW8Hbg6cDZwH7gLYcXnWH1+oGGqq1Vtbaq1k5MTMy1bkmSJKmvegrGSY6nE4rfV1UfBqiqh6rqUFU9CryD7x8uMQms6lp9JfBg/0qWJEmS+q+Xs1IEeBdwd1W9tat9eddiLwLubNM7gA1JTkhyBrAGuLl/JUuSJEn918tZKc4DXgp8Icltre21wIuTnE3nMIm9wMsAqmp3ku3AXXTOaHGlZ6SQJEnSuDtqMK6qzzDzccMfO8I6W4AtC6hLkiRJGiqvfCdJkiRhMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRLQQzBOsirJp5LcnWR3kle09lOS3JDk3nZ/ctc6m5PsSXJPkgsH+QIkSZKkfuhlxPgR4FVV9UzgXODKJGcCm4CdVbUG2Nke0+ZtAM4CLgKuTrJsEMVLkvojyZOTfDDJF9tAyHMdAJG01Bw1GFfV/qq6tU0/DNwNrADWA9vaYtuAS9v0euDaqjpYVfcBe4B1fa5bktRffwD8RVX9K+DZdPp6B0AkLSlzOsY4yWrgHOAm4PSq2g+d8Ayc
1hZbAezrWm2ytUmSxlCSJwE/CbwLoKr+qaq+gQMgkpaYnoNxkpOADwGvrKpvHWnRGdpqhu1dkWRXkl1TU1O9liFJ6r+nAVPAnyT5fJJ3JnkCfRgAsa+XtJj0FIyTHE8nFL+vqj7cmh9KsrzNXw4caO2TwKqu1VcCD07fZlVtraq1VbV2YmJivvVLkhbuOOA5wNur6hzgO7TDJmbR0wAI2NdLWlx6OStF6Hy9dndVvbVr1g5gY5veCFzX1b4hyQlJzgDWADf3r2RJUp9NApNVdVN7/EE6QXlBAyCStNj0MmJ8HvBS4HlJbmu3i4GrgJ9Oci/w0+0xVbUb2A7cBfwFcGVVHRpI9ZKkBauqLwP7kjyjNV1Apw93AETSknLc0Raoqs8w89dm0Ok8Z1pnC7BlAXVJkobrvwDvS/JY4O+B/0hn8GR7ksuB+4HLoDMAkuTwAMgjOAAi6Rhx1GAsSTr2VdVtwNoZZjkAImnJ8JLQkiRJEgZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAUv0ynerN11/xPl7r7pkSJVIkiRpXDhiLEmSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJwBI9j/HRHOk8x57jWJIk6djkiLEkSZKEwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJMBgLEmSJAE9BOMk1yQ5kOTOrrY3JHkgyW3tdnHXvM1J9iS5J8mFgypckiRJ6qdeRozfDVw0Q/vbqursdvsYQJIzgQ3AWW2dq5Ms61exkiRJ0qAcNRhX1aeBr/W4vfXAtVV1sKruA/YA6xZQnyRJkjQUCznG+OVJ7miHWpzc2lYA+7qWmWxtPyDJFUl2Jdk1NTW1gDIkSZKkhZtvMH478HTgbGA/8JbWnhmWrZk2UFVbq2ptVa2dmJiYZxmSJElSfxw3n5Wq6qHD00neAXy0PZwEVnUtuhJ4cN7VLTKrN11/xPl7r7pkSJVIkiRpruY1YpxkedfDFwGHz1ixA9iQ5IQkZwBrgJsXVqIkSZI0eEcdMU7yAeB84NQkk8DrgfOTnE3nMIm9wMsAqmp3ku3AXcAjwJVVdWgglUuSJEl9dNRgXFUvnqH5XUdYfguwZSFFSZIkScPmle8kSZIkDMaSJEkSYDCWJEmSAIOxJEmSBMzzPMZL2dHOVTzfdT3HsSRJ0mg5YixJAiDJsiSfT/LR9viUJDckubfdn9y17OYke5Lck+TC0VUtSf1jMJYkHfYK4O6ux5uAnVW1BtjZHpPkTGADcBZwEXB1kmVDrlWS+s5gLEkiyUrgEuCdXc3rgW1tehtwaVf7tVV1sKruA/YA64ZUqiQNjMFYkgTw+8BrgEe72k6vqv0A7f601r4C2Ne13GRrk6RFzWAsSUtckhcAB6rqll5XmaGtZtn2FUl2Jdk1NTU17xolaRgMxpKk84AXJtkLXAs8L8mfAg8lWQ7Q7g+05SeBVV3rrwQenGnDVbW1qtZW1dqJiYlB1S9JfWEwlqQlrqo2V9XKqlpN50d1f1VVLwF2ABvbYhuB69r0DmBDkhOSnAGsAW4ectmS1Heex1iSNJurgO1JLgfuBy4DqKrdSbYDdwGPAFdW1aHRlSlJ/WEwliR9T1XdCNzYpr8KXDDLcluALUMrTJKGwEMpJEmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiQAjht1AepYven6WeftveqSIVYiSZK0NDliLEmSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBPQTjJNckOZDkzq62U5LckOTedn9y17zNSfYkuSfJhYMqXJIkSeqn
XkaM3w1cNK1tE7CzqtYAO9tjkpwJbADOautcnWRZ36qVJEmSBuSowbiqPg18bVrzemBbm94GXNrVfm1VHayq+4A9wLr+lCpJkiQNznyPMT69qvYDtPvTWvsKYF/XcpOtTZIkSRpr/f7xXWZoqxkXTK5IsivJrqmpqT6XIUmSJM3NfIPxQ0mWA7T7A619EljVtdxK4MGZNlBVW6tqbVWtnZiYmGcZkiRJUn/MNxjvADa26Y3AdV3tG5KckOQMYA1w88JKlCRJkgbvuKMtkOQDwPnAqUkmgdcDVwHbk1wO3A9cBlBVu5NsB+4CHgGurKpDA6pdkiRJ6pujBuOqevEssy6YZfktwJaFFCVJkiQNm1e+kyRJkjAYS5IkSUAPh1Jo9FZvuv6I8/dedcmQKtF0/m0kSTp2OGIsSZIk4YjxMeFIo5aOWEqSJPXGYLyEeRiAJEnS93kohSRJkoQjxponD9+QJEnHGkeMJUmSJAzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEuAloTVmjnSpafBy05IkaXAcMZYkSZJwxFgaqCONgDv6rXGRZBXwHuCHgUeBrVX1B0lOAf4MWA3sBX6+qr7e1tkMXA4cAn61qv5yBKVLUl85YixJegR4VVU9EzgXuDLJmcAmYGdVrQF2tse0eRuAs4CLgKuTLBtJ5ZLURwZjSVriqmp/Vd3aph8G7gZWAOuBbW2xbcClbXo9cG1VHayq+4A9wLqhFi1JA2AwliR9T5LVwDnATcDpVbUfOuEZOK0ttgLY17XaZGubaXtXJNmVZNfU1NTA6pakfvAYYw3d0c48IWk0kpwEfAh4ZVV9K8msi87QVjMtWFVbga0Aa9eunXEZSRoXBmPNygArLR1JjqcTit9XVR9uzQ8lWV5V+5MsBw609klgVdfqK4EHh1etJA2GwfgYd6yFW8/yIPVfOkPD7wLurqq3ds3aAWwErmr313W1vz/JW4GnAGuAm4dXsSQNhsFYknQe8FLgC0lua22vpROItye5HLgfuAygqnYn2Q7cReeMFldW1aGhVy1JfWYwVt8da6PU0rGuqj7DzMcNA1wwyzpbgC0DK0qSRsCzUkiSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJwALPSpFkL/AwcAh4pKrWJjkF+DNgNbAX+Pmq+vrCypQW5mhnyvAcyJIkqR+na/u3VfWVrsebgJ1VdVWSTe3xr/fheaSR8PRzkiQtDYM4j/F64Pw2vQ24EYOxhsAAK0mSFmKhxxgX8IkktyS5orWdXlX7Adr9aTOtmOSKJLuS7JqamlpgGZIkSdLCLHTE+LyqejDJacANSb7Y64pVtRXYCrB27dpaYB3SgjjaLEmSFjRiXFUPtvsDwEeAdcBDSZYDtPsDCy1SkiRJGrR5jxgneQLwmKp6uE3/DPBbwA5gI3BVu7+uH4VKS4ln0ZAkafgWcijF6cBHkhzezvur6i+SfA7YnuRy4H7gsoWXKUmSJA3WvINxVf098OwZ2r8KXLCQoiRJkqRh88p3kiRJEgZjSZIkCRjMBT4k9WAUp4jzR32SJM3OEWNJkiQJg7EkSZIEGIwlSZIkwGAsSZIkAf74TjrmjOJHfZIkHQsMxtIiZPiVJKn/DMaSenKkMO5p3iRJxwKDsaTv8dzKkqSlzB/fSZIkSRiMJUmSJMBgLEmSJAEeYyypDwZ5nLA/+pMkDYsjxpIkSRIGY0mSJAkwGEuSJEmAxxhLGgKv1CdJWgwcMZYkSZIwGEuSJEmAwViSJEkCDMaSJEkS4I/vJC1iXvxDktRPjhhLksaaZzWRNCwGY0nSomFIljRIBmNJ0qJiOJY0KAZjSZIkCYOxJGkRcJRY0jAYjCVJi4LhWNKgGYwlSYuWYVlSPxmMJUmSJAzGkiRJEjDAYJzkoiT3JNmTZNOgnkeSNBqj7Oe7D6FYvel6D6mQ1BcDCcZJlgF/
BPwscCbw4iRnDuK5JEnDNw79vGFYUr8NasR4HbCnqv6+qv4JuBZYP6DnkiQN31D6+fmEX0eQJc3XoILxCmBf1+PJ1iZJOjaMXT8/PRAbjiXN1XED2m5maKt/tkByBXBFe/jtJPfM43lOBb4yj/UGaRxrAuuaK+vq3TjWRN40r7qeOohajlFH7eehL339gt5fedN815yTsfw30MX6Fm7caxz3+mC8apy1rx9UMJ4EVnU9Xgk82L1AVW0Fti7kSZLsqqq1C9lGv41jTWBdc2VdvRvHmmB86zqGHLWfh4X39Yvh7zjuNVrfwo17jeNeHyyOGmFwh1J8DliT5IwkjwU2ADsG9FySpOGzn5d0zBnIiHFVPZLk5cBfAsuAa6pq9yCeS5I0fPbzko5FgzqUgqr6GPCxQW2/WdChGAMyjjWBdc2VdfVuHGuC8a3rmLGE+/npxr1G61u4ca9x3OuDxVEjqfqB30pIkiRJS46XhJYkSZJYJME4yTVJDiS5s6vtDUkeSHJbu108grpWJflUkruT7E7yitZ+SpIbktzb7k8ek7pGts+SnJjk5iS3t5re2NpHva9mq2vk769Wx7Ikn0/y0fZ4pPtrlprGZV/tTfKFVsOu1jby/aX5GeXlpmf5zJn1vZRkc6vzniQXdrX/WHtP7knyh0lmOsXdfOqb82fPMGucT38/7H3Ytf2e+9hR1DjXfm0E78UnJ/lgki+29+Nzx6m+eamqsb8BPwk8B7izq+0NwKtHXNdy4Dlt+onA39G5NOrvApta+ybgTWNS18j2GZ1znp7Upo8HbgLOHYN9NVtdI39/tZr+K/B+4KPt8Uj31yw1jcu+2gucOq1t5PvL27z+lsuALwFPAx4L3A6cOcTnn+kzZ8b3UutbbwdOAM5odS9r824Gntv6mY8DP9un+ub02TPsGufa349iH3bV2lMfO6oa59Kvjei9uA34T236scCTx6m++dwWxYhxVX0a+Nqo65iuqvZX1a1t+mHgbjpXflpP581Cu790TOoamer4dnt4fLsVo99Xs9U1cklWApcA7+xqHun+mqWmcTbS/aV5G8rlpmczy2fObO+l9cC1VXWwqu4D9gDrkiwHnlRVn63OJ/976NP7bx6fPUOtcR79/dD3Icy5jx1JjbMYixqTPInOfyLfBVBV/1RV3xiX+uZrUQTjI3h5kjva114j/Yo0yWrgHDr/Mz69qvZDpwMDThuTumCE+6x9ZXUbcAC4oarGYl/NUheM/v31+8BrgEe72ka9v2aqCUa/r6DzwfuJJLekc7U1GP3+0vyM3eWmmf29NFutK9r09Pa+6vGzZ+g1zrG/H9U+/H1672NHVeNc+rVh1/g0YAr4k3Y4yjuTPGGM6puXxRyM3w48HTgb2A+8ZVSFJDkJ+BDwyqr61qjqmG6Guka6z6rqUFWdTecKWeuSPGuYzz+bWeoa6b5K8gLgQFXdMsznPZIj1DQu/xbPq6rnAD8LXJnkJ0dUhxaup8tNj4nZah34a5jDZ8/Qa5xjfz/0+ubRx47q7zyXfm3YNR5H55Cjt1fVOcB36Bw6MZuR/VuZi0UbjKvqofYP71HgHXS+ehu6JMfT6ZjeV1Ufbs0Pta8GaPcHxqGucdln7auWG4GLGIN9NVNdY7CvzgNemGQvna+Rn5fkTxnt/pqxpjHYVwBU1YPt/gDwkVbH2Ly/NCc9XW56yGZ7L81W62Sbnt7eF3P87BlJjdBzfz+K+ubax45kH86xXxt2jZPAZNe3rB+kE5THpb55WbTB+PBOb14E3DnbsgOsIXSOrbm7qt7aNWsHsLFNbwSuG4e6RrnPkkwkeXKbfhzwfOCLjH5fzVjXqN9fVbW5qlZW1Wo6l9r9q6p6CSPcX7PVNOp9BZDkCUmeeHga+JlWx0jfX5q3cbzc9GzvpR3AhiQnJDkDWAPc3L5CfjjJua1P/kX69P6bx2fPUGucR38/9H04jz52FH/nufZrQ62xqr4M7EvyjNZ0AXDXuNQ3bzWi
X/3N5QZ8gM5XtN+l8z+Ly4H3Al8A7qCzs5ePoK6foDPcfwdwW7tdDPwLYCdwb7s/ZUzqGtk+A/418Pn23HcCv9naR72vZqtr5O+vrhrP5/u/mB7p/pqlppHvKzrHut3ebruB143T/vI2r7/pxXTOtvClw3/PIT73TJ85s76XgNe1Ou+h69f0wNrWr3wJ+B+0i2r1ob45f/YMs8b59PfD3ofT6u2pjx3B33nO/doIajwb2NX+1v8bOHmc6pvPzSvfSZIkSSziQykkSZKkfjIYS5IkSRiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSAP8f3ZL2ThW3dcYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 864x432 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Summarise the loaded log: display_dataset prints the per-case trace-length\n",
    "# and true-total-time ranges (the stored output also holds a two-panel figure).\n",
    "display_dataset(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "326\n"
     ]
    }
   ],
   "source": [
    "# Keep the cases whose true total time lies in [30, 60] and whose trace has\n",
    "# between 15 and 50 events (all bounds inclusive).\n",
    "# Per-case statistics are computed in one pass over the frame instead of\n",
    "# re-filtering the whole frame once per case id.\n",
    "case_col = \"case:concept:name\"\n",
    "case_stats = pd.DataFrame({\n",
    "    # total_time_true taken from the first row of each case (same as .iloc[0])\n",
    "    \"true_value\": df.drop_duplicates(subset=case_col).set_index(case_col)[\"total_time_true\"],\n",
    "    # number of rows (events) recorded for each case\n",
    "    \"trace_length\": df[case_col].value_counts(),\n",
    "}).sort_index()\n",
    "keep_mask = case_stats[\"true_value\"].between(30, 60) & case_stats[\"trace_length\"].between(15, 50)\n",
    "# Case ids come out sorted, so the list order is reproducible across runs\n",
    "# (the previous order followed set iteration).\n",
    "filtered_cids = list(case_stats.loc[keep_mask].index.values)\n",
    "print(len(filtered_cids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200 50\n",
      "(7062, 14)\n"
     ]
    }
   ],
   "source": [
    "# Both counts are passed positionally to sample_data; the stored output\n",
    "# (\"200 50\") suggests they are the train / test case counts -- TODO confirm.\n",
    "n_train_cases, n_test_cases = 200, 50\n",
    "df_samples, train_cids, test_cids = sample_data(\n",
    "    df, filtered_cids, n_train_cases, n_test_cases\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR:root:[Errno 2] No such file or directory: '/Users/wangnaixuan/Documents/GitHub/prediction_service/data/drl_data/bpic2019/bpic2019_sample250.csv': read sample file failed.\n"
     ]
    }
   ],
   "source": [
    "from config import DeepReinforceLearningParameters\n",
    "\n",
    "# NOTE(review): a later cell imports `DRLParameters` from the same module for\n",
    "# the same purpose -- confirm which class name is current.\n",
    "save_path = DeepReinforceLearningParameters(dataset_name=\"bpic2019\").DATA_PATH\n",
    "\n",
    "# Write the sampled events, then the train/test case ids, under DATA_PATH.\n",
    "df_samples.to_csv(path_or_buf=save_path + \"/bpic2019_sample250.csv\", index=False)\n",
    "for ids_suffix, case_ids in ((\"/train_case_ids.npy\", train_cids), (\"/test_case_ids.npy\", test_cids)):\n",
    "    np.save(save_path + ids_suffix, np.array(case_ids))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Case 2: 读取已保存的 case id（并据此重建抽样数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Reload the stored train/test case-id arrays instead of sampling again.\n",
    "# NOTE(review): this path is hard-coded, while the saving cells use the\n",
    "# config DATA_PATH -- confirm both point at the same directory.\n",
    "train_cids = np.load(file=\"../data/drl_data/bpic2019/train_case_ids.npy\")\n",
    "test_cids = np.load(file=\"../data/drl_data/bpic2019/test_case_ids.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7062, 14)\n"
     ]
    }
   ],
   "source": [
    "# Collect every event that belongs to a sampled train or test case.\n",
    "# One boolean mask replaces the per-case DataFrame.append loop: append copied\n",
    "# the growing frame on every iteration, was deprecated in pandas 1.4 and was\n",
    "# removed in pandas 2.0.\n",
    "case_col = \"case:concept:name\"\n",
    "selected_cids = set(train_cids) | set(test_cids)\n",
    "df_sample = (\n",
    "    df[df[case_col].isin(selected_cids)]\n",
    "    # mergesort is stable: each case stays contiguous and keeps its event\n",
    "    # order, and cases appear in ascending id order (deterministic).\n",
    "    .sort_values(case_col, kind=\"mergesort\")\n",
    ")\n",
    "print(df_sample.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import DRLParameters\n",
    "\n",
    "# NOTE(review): an earlier cell imports `DeepReinforceLearningParameters` from\n",
    "# the same module for the same purpose -- confirm which class name is current.\n",
    "save_path = DRLParameters(dataset_name=\"bpic2019\").DATA_PATH\n",
    "\n",
    "# Export the events of the reloaded train/test cases as one CSV under DATA_PATH.\n",
    "sample_csv_path = save_path + \"/bpic2019_nn_sample250.csv\"\n",
    "df_sample.to_csv(sample_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nash",
   "language": "python",
   "name": "nash"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
